diff --git a/.agents/skills/tilelang-build/SKILL.md b/.agents/skills/tilelang-build/SKILL.md new file mode 100644 index 0000000000..f474736fc1 --- /dev/null +++ b/.agents/skills/tilelang-build/SKILL.md @@ -0,0 +1,88 @@ +--- +name: tilelang-build +description: Repository-specific build, rebuild, install, and test instructions for tilelang. Use when working in the tilelang repository and the correct commands are needed for building from source, reinstalling after changes, or running project tests. +--- + +# Build & Install + +## Installing / Rebuilding tilelang + +The standard way to build and install: + +```bash +pip install . +``` + +Or with verbose output for debugging build issues: + +```bash +pip install . -v +``` + +`uv pip install .` also works if `uv` is available but is not required. + +Build dependencies are declared in `pyproject.toml` and resolved automatically during `pip install .`. + +If `ccache` is available, repeated builds only recompile changed C++ files. + +## Alternative: Development Build with `--no-build-isolation` + +If you need faster iteration (e.g. calling `cmake` directly to recompile C++ without re-running the full pip install), install build dependencies first: + +```bash +pip install -r requirements-dev.txt +pip install --no-build-isolation . +``` + +After this, you can invoke `cmake --build build` directly to recompile only changed C++ files. This is useful when iterating on C++ code. + +## Alternative: cmake + PYTHONPATH (recommended for C++ development) + +For the fastest C++ iteration, bypass pip entirely and drive cmake directly: + +```bash +# Configure (auto-detects CUDA; git submodules are initialised automatically) +cmake -S . -B build + +# Build +cmake --build build -j$(nproc) + +# Make the local tilelang package importable +export PYTHONPATH=$(pwd):$PYTHONPATH +``` + +After the initial configure, recompiling is just `cmake --build build -j$(nproc)`. The runtime automatically discovers native libraries from `build/lib/` when it detects a dev checkout (see `tilelang/env.py`). + +Useful cmake options: + +| Flag | Purpose | +|------|---------| +| `-DUSE_CUDA=ON/OFF` | Enable/disable CUDA backend (ON by default) | +| `-DUSE_ROCM=ON` | Enable ROCm/HIP backend | +| `-DUSE_METAL=ON` | Enable Metal backend (default on macOS) | +| `-DCMAKE_BUILD_TYPE=Debug` | Debug build with `TVM_LOG_DEBUG` enabled | + +## Editable Installs + +**Never use `pip install -e .`** (editable install). When running Python from the repo root, the local `./tilelang` directory is imported instead of the installed copy (because `.` is on `sys.path` by default), so an editable install is unnecessary and, with this project's layout, only invites import confusion. + +## Running Tests + +Most tests require a GPU.
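+Run the full suite from the repository root: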
+ +```bash +python -m pytest testing/python/ -x +``` + +Run a specific test file or test case: + +```bash +python -m pytest testing/python/language/test_tilelang_language_copy.py -x +python -m pytest testing/python/language/test_tilelang_language_copy.py -x -k "test_name" +``` + +For Metal-specific tests (requires macOS with Apple Silicon): + +```bash +python -m pytest testing/python/metal/ -x +``` diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 8d5f3ffb48..1931c353ab 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -201,10 +201,10 @@ jobs: if [[ "${UV_INDEX}" == *"/nightly/"* ]]; then uv pip install --prerelease=allow -v torch fi - uv pip install -v -r requirements-test.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + uv pip install -v -r requirements-test.txt echo "import torch; print(f'torch: {torch.__version__}')" | uv run --no-project --script - if [[ "${{ matrix.runner.toolkit }}" == *"CUDA"* ]]; then - uv pip install --no-build-isolation-package=flash-attn -v -r requirements-test-cuda.txt -i https://pypi.tuna.tsinghua.edu.cn/simple + uv pip install --no-build-isolation-package=flash-attn -v -r requirements-test-cuda.txt echo "import flash_attn; print(f'flash_attn: {flash_attn.__version__}')" | uv run --no-project --script - # elif [[ "${{ matrix.runner.toolkit }}" == *"ROCm"* ]]; then # uv pip install -v -r requirements-test-rocm.txt @@ -304,12 +304,12 @@ jobs: # Run distributed tests (marked with @requires_distributed) with TILELANG_USE_DISTRIBUTED=1 # DeepEP tests requires fullmesh nvl or internode environment, we disable for now echo "Running distributed examples with TILELANG_USE_DISTRIBUTED=1:" - TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed --ignore-glob='*deepep*' . || true + TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed --ignore-glob='*deepep*' . # Run remaining example tests (non-distributed) # Temporarily disable problematic tests: sink, vs_sparse echo "Running non-distributed examples:" - "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not sink and not vs_sparse" . || true + "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not sink and not vs_sparse" . # NVIDIA CUDA tests - name: Run CUDA tests with Python ${{ matrix.python-version }} (${{ matrix.runner.toolkit }}) @@ -325,12 +325,12 @@ jobs: # Run distributed tests (marked with @requires_distributed) with TILELANG_USE_DISTRIBUTED=1 echo "Running distributed tests with TILELANG_USE_DISTRIBUTED=1:" - TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed . || true + TILELANG_USE_DISTRIBUTED=1 "${PYTEST[@]}" --maxfail=3 --numprocesses=1 -m distributed . # Run remaining tests (non-distributed) # Temporarily disable problematic tests: tilelibrary_gemm, jit_gemm_ctypes echo "Running non-distributed tests:" - "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not tilelibrary_gemm and not jit_gemm_ctypes" . || true + "${PYTEST[@]}" --maxfail=3 --numprocesses=2 -m "not distributed" -k "not tilelibrary_gemm and not jit_gemm_ctypes" . - name: List generated files if: ${{ !cancelled() }} diff --git a/.github/workflows/dist.yml b/.github/workflows/dist.yml index 74132ffb3f..ade9eb8cb2 100644 --- a/.github/workflows/dist.yml +++ b/.github/workflows/dist.yml @@ -125,7 +125,7 @@ jobs: fi - name: Build wheels - uses: pypa/cibuildwheel@v3.3 + uses: pypa/cibuildwheel@v3.4 with: package-dir: . 
output-dir: wheelhouse diff --git a/.gitignore b/.gitignore index e85c2c0943..9d994457ee 100644 --- a/.gitignore +++ b/.gitignore @@ -133,3 +133,4 @@ maint/host_checks/logs/* # perf regression test .perf_regression/ +nvshmem_issue.md diff --git a/3rdparty/composable_kernel b/3rdparty/composable_kernel index 1c45ca35dd..b38bb492a1 160000 --- a/3rdparty/composable_kernel +++ b/3rdparty/composable_kernel @@ -1 +1 @@ -Subproject commit 1c45ca35dd5c215e0c1db1f40f01556f467f52a8 +Subproject commit b38bb492a1a55b5abb0c345962143c0f9c482cfb diff --git a/3rdparty/tvm b/3rdparty/tvm index 23bce012ff..0e15b274bc 160000 --- a/3rdparty/tvm +++ b/3rdparty/tvm @@ -1 +1 @@ -Subproject commit 23bce012ffd255a24289eea6ceab74a40b94a096 +Subproject commit 0e15b274bce8b46f971abf5ac390e844aa6acee5 diff --git a/CMakeLists.txt b/CMakeLists.txt index 4fb370d509..a4601f4f3e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,12 @@ # https://github.com/mlc-ai/mlc-llm/blob/main/CMakeLists.txt cmake_minimum_required(VERSION 3.26) + +# Detect CUDA toolkit: tries host installation first, then falls back to +# pip-installed packages (env WITH_PIP_CUDA_TOOLCHAIN or auto-detect). +# Must be included before project() so CMAKE_CUDA_COMPILER is set. +include(${CMAKE_CURRENT_LIST_DIR}/cmake/FindPipCUDAToolkit.cmake) + project(TILE_LANG C CXX) set(CMAKE_CXX_STANDARD 17) @@ -110,6 +116,37 @@ foreach(BACKEND IN LISTS TILELANG_BACKENDS) endforeach() set(PREBUILD_CYTHON ON) + +# CUDA stub libraries (cuda/cudart/nvrtc) are used to build wheels that can run +# across different CUDA Toolkit major versions and/or on CPU-only machines by +# avoiding hard DT_NEEDED dependencies on versioned CUDA SONAMEs. +# +# These stubs are currently POSIX-only (dlopen/dlsym via <dlfcn.h>). +if(WIN32 AND NOT CYGWIN) + set(_TILELANG_USE_CUDA_STUBS_DEFAULT OFF) +else() + set(_TILELANG_USE_CUDA_STUBS_DEFAULT ON) +endif() +option(TILELANG_USE_CUDA_STUBS + "Use POSIX dlopen-based CUDA stub libraries (cuda/cudart/nvrtc) for portable wheels" + ${_TILELANG_USE_CUDA_STUBS_DEFAULT}) +unset(_TILELANG_USE_CUDA_STUBS_DEFAULT) + +# HIP stub libraries (hip/hiprtc) are used to build wheels that can be imported +# on machines without ROCm installed by avoiding hard DT_NEEDED dependencies on +# libamdhip64.so / libhiprtc.so. +# +# These stubs are currently POSIX-only (dlopen/dlsym via <dlfcn.h>). +if(WIN32 AND NOT CYGWIN) + set(_TILELANG_USE_HIP_STUBS_DEFAULT OFF) +else() + # Only meaningful when USE_ROCM is enabled. + set(_TILELANG_USE_HIP_STUBS_DEFAULT ON) +endif() +option(TILELANG_USE_HIP_STUBS + "Use POSIX dlopen-based HIP stub libraries (hip/hiprtc) for portable wheels" + ${_TILELANG_USE_HIP_STUBS_DEFAULT}) +unset(_TILELANG_USE_HIP_STUBS_DEFAULT) # Configs end include(cmake/load_tvm.cmake) @@ -127,6 +164,8 @@ foreach(BACKEND IN LISTS TILELANG_BACKENDS) set(${_backend_var} ${TILELANG_OPTION_${_backend_var}} CACHE STRING "${_doc}" FORCE) set(${_backend_var} ${TILELANG_OPTION_${_backend_var}}) endforeach() +# tvm tries to detect gtest by default, but may fail if its header is not installed.
+set(USE_GTEST OFF) # Include directories for TileLang set(TILE_LANG_INCLUDES ${TVM_INCLUDES}) @@ -140,8 +179,8 @@ file(GLOB TILE_LANG_SRCS src/op/*.cc src/target/utils.cc src/target/codegen_c_host.cc - src/target/codegen_cpp.cc - src/target/rt_mod_cpp.cc + src/target/codegen_c.cc + src/target/rt_mod_c.cc # intrin_rule doesn't have system dependency src/target/intrin_rule*.cc ) @@ -151,6 +190,8 @@ list(APPEND TILE_LANG_SRCS src/runtime/error_helpers.cc ) +set(TILELANG_OUTPUT_TARGETS tilelang tvm) + # Track if the user explicitly selected a backend via cache options. set(TILELANG_BACKEND_USER_SELECTED OFF) foreach(BACKEND IN LISTS TILELANG_BACKENDS) @@ -183,49 +224,11 @@ if(NOT TILELANG_BACKEND_USER_SELECTED) endif() endif() -if(USE_METAL) - file(GLOB TILE_LANG_METAL_SRCS - src/target/rt_mod_metal.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_METAL_SRCS}) - # FIXME: CIBW failed with backtrace, why??? - set(TVM_FFI_USE_LIBBACKTRACE OFF) -elseif(USE_ROCM) - set(CMAKE_HIP_STANDARD 17) - include(${TVM_SOURCE}/cmake/utils/FindROCM.cmake) - find_rocm(${USE_ROCM}) - add_compile_definitions(__HIP_PLATFORM_AMD__ __HIP_PLATFORM_HCC__=1) - - file(GLOB TILE_LANG_HIP_SRCS - src/target/codegen_hip.cc - src/target/rt_mod_hip.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_HIP_SRCS}) - list(APPEND TILE_LANG_INCLUDES ${ROCM_INCLUDE_DIRS}) -elseif(USE_CUDA) - set(CMAKE_CUDA_STANDARD 17) - find_package(CUDAToolkit REQUIRED) - set(CMAKE_CUDA_COMPILER "${CUDAToolkit_BIN_DIR}/nvcc") - add_compile_definitions("CUDA_MAJOR_VERSION=${CUDAToolkit_VERSION_MAJOR}") - - # Set `USE_CUDA=/usr/local/cuda-x.y` - cmake_path(GET CUDAToolkit_BIN_DIR PARENT_PATH USE_CUDA) - - file(GLOB TILE_LANG_CUDA_SRCS - src/runtime/runtime.cc - src/runtime/tilescale_cuda_module.cc - src/target/ptx.cc - src/target/codegen_cuda.cc - src/target/codegen_py.cc - src/target/codegen_utils.cc - src/target/codegen_cutedsl.cc - src/target/rt_mod_cuda.cc - src/target/rt_mod_cutedsl.cc - ) - list(APPEND TILE_LANG_SRCS ${TILE_LANG_CUDA_SRCS}) - - list(APPEND TILE_LANG_INCLUDES ${CUDAToolkit_INCLUDE_DIRS}) -endif() +# Backend-local CMake files own native source lists, stubs, include paths, and +# compile definitions. Top-level CMake only selects and delegates. +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/cuda/CMakeLists.txt") +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/rocm/CMakeLists.txt") +include("${CMAKE_CURRENT_SOURCE_DIR}/src/backend/metal/CMakeLists.txt") set(USE_Z3 ON CACHE STRING "Use Z3 SMT solver for TileLang optimizations") set(USE_PYPI_Z3 ON CACHE BOOL "Use Z3 provided by PyPI z3-solver package") @@ -235,27 +238,50 @@ if(USE_Z3 AND USE_PYPI_Z3) find_package(Z3 REQUIRED) endif() +# Enable custom logging so we control the output format (e.g. strip build paths +# from __FILE__ so wheel users don't see CI machine paths in warnings). +set(USE_CUSTOM_LOGGING ON CACHE BOOL "Use custom logging implementation" FORCE) + +# Detect release (wheel) builds: in CI (cibuildwheel) or scikit-build-core wheel builds, +# we strip source paths from LOG(WARNING) etc. for a cleaner user experience. +# Local dev builds keep full paths for debugging. 
+if(DEFINED ENV{CIBUILDWHEEL} OR "$ENV{SKBUILD_STATE}" STREQUAL "wheel") + set(TILELANG_RELEASE_BUILD_DEFAULT ON) +else() + set(TILELANG_RELEASE_BUILD_DEFAULT OFF) +endif() +option(TILELANG_RELEASE_BUILD "Strip source paths from log messages (for wheel releases)" ${TILELANG_RELEASE_BUILD_DEFAULT}) + # Include tvm after configs have been populated add_subdirectory(${TVM_SOURCE} tvm EXCLUDE_FROM_ALL) +# Provide the custom LogMessageImpl / LogFatalImpl implementation to TVM, +# since TVM_LOG_CUSTOMIZE=1 requires them to be supplied by the user. +target_sources(tvm_objs PRIVATE "${CMAKE_CURRENT_SOURCE_DIR}/src/runtime/logging.cc") +if(TILELANG_RELEASE_BUILD) + target_compile_definitions(tvm_objs PRIVATE TILELANG_RELEASE_BUILD=1) +endif() + # Resolve compile warnings in tvm add_compile_definitions(DMLC_USE_LOGGING_LIBRARY=) add_library(tilelang_objs OBJECT ${TILE_LANG_SRCS}) # Set debug mode compile definitions -# We open the deubg option of TVM, i.e. TVM_LOG_DEBUG +# Enable the TVM debug option, i.e., TVM_LOG_DEBUG if(CMAKE_BUILD_TYPE STREQUAL "Debug") message(STATUS "Building TileLang with DEBUG mode") target_compile_definitions(tilelang_objs PRIVATE "TVM_LOG_DEBUG") endif() target_include_directories(tilelang_objs PRIVATE ${TILE_LANG_INCLUDES}) +target_compile_definitions(tilelang_objs PRIVATE TVM_LOG_CUSTOMIZE=1) +if(TILELANG_RELEASE_BUILD) + target_compile_definitions(tilelang_objs PRIVATE TILELANG_RELEASE_BUILD=1) +endif() add_library(tilelang SHARED $) -add_library(tilelang_module SHARED $) -target_link_libraries(tilelang PUBLIC tvm_runtime tvm) -target_link_libraries(tilelang_module PUBLIC tvm) +target_link_libraries(tilelang PUBLIC tvm) # Place dev build outputs under build/lib for consistency set_target_properties(tilelang PROPERTIES @@ -263,11 +289,6 @@ set_target_properties(tilelang PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" ) -set_target_properties(tilelang_module PROPERTIES - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" -) # Build cython extension find_package(Python REQUIRED COMPONENTS Interpreter Development.Module ${SKBUILD_SABI_COMPONENT}) @@ -288,6 +309,13 @@ endif() python_add_library(tilelang_cython_wrapper MODULE "${CMAKE_BINARY_DIR}/tilelang_cython_wrapper.cpp" ${USE_SABI} WITH_SOABI) +# Disable Cython's PEP-489 multi-phase init for the wrapper. The generated +# C++ depends on CPython's private `_xxsubinterpreters` module at import +# time, which is stripped from some distributor-built Python 3.12 builds +# (notably Red Hat's RHEL 9 system Python). Single-phase init avoids that +# dependency and matches Cython's own suggested workaround. See #2125. 
+target_compile_definitions(tilelang_cython_wrapper PRIVATE CYTHON_PEP489_MULTI_PHASE_INIT=0) + # Ensure dev builds drop the extension into build/lib alongside other shared libs set_target_properties(tilelang_cython_wrapper PROPERTIES LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" @@ -304,7 +332,7 @@ install(TARGETS tilelang_cython_wrapper # Copy libz3.so to build folder to workaround isolated build env issue if(USE_Z3 AND USE_PYPI_Z3) get_target_property(Z3_LIBRARY_PATH z3::libz3 IMPORTED_LOCATION) - install(FILES "${Z3_LIBRARY_PATH}" DESTINATION "${CMAKE_BINARY_DIR}/tvm") + install(FILES "${Z3_LIBRARY_PATH}" DESTINATION "${CMAKE_BINARY_DIR}/lib") if(APPLE) set_target_properties(tvm PROPERTIES BUILD_RPATH "@loader_path") else() @@ -312,87 +340,74 @@ if(USE_Z3 AND USE_PYPI_Z3) endif() endif() +if(DEFINED TILELANG_ACTIVE_BACKEND_STUB_LINK) + foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + target_link_libraries(${target} PUBLIC ${TILELANG_ACTIVE_BACKEND_STUB_LINK}) + endforeach() +endif() + +# Append stub targets after the linking loop so they don't link to themselves +if(DEFINED TILELANG_ACTIVE_BACKEND_STUB_TARGETS) + list(APPEND TILELANG_OUTPUT_TARGETS ${TILELANG_ACTIVE_BACKEND_STUB_TARGETS}) +endif() + +unset(PATCHELF_EXECUTABLE CACHE) + if(APPLE) set(TILELANG_INSTALL_RPATH "@loader_path;@loader_path/../../tvm_ffi/lib") if(USE_Z3 AND USE_PYPI_Z3) - # some z3 is placed in lib/ and some in bin/, we add both in rpath - list(APPEND TILELANG_INSTALL_RPATH "@loader_path/../../z3/lib" "@loader_path/../../z3/bin") + # Some z3 is placed in lib/ and some in bin/, we add both in rpath + string(APPEND TILELANG_INSTALL_RPATH ";@loader_path/../../z3/lib;@loader_path/../../z3/bin") endif() elseif(UNIX) set(TILELANG_INSTALL_RPATH "\$ORIGIN:\$ORIGIN/../../tvm_ffi/lib") if(USE_Z3 AND USE_PYPI_Z3) - # cmake uses ; by default, we explicitly use : for linux string(APPEND TILELANG_INSTALL_RPATH ":\$ORIGIN/../../z3/lib") endif() + if(DEFINED TILELANG_ACTIVE_BACKEND_RPATH_EXTRA) + string(APPEND TILELANG_INSTALL_RPATH "${TILELANG_ACTIVE_BACKEND_RPATH_EXTRA}") + endif() + find_program(PATCHELF_EXECUTABLE patchelf) + if (NOT PATCHELF_EXECUTABLE) + message(STATUS "`patchelf` not found.") + endif() endif() -set_target_properties( - tilelang tilelang_module tvm tvm_runtime - PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") +# Let libtilelang search for tvm in the same directory +foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + set_target_properties(${target} PROPERTIES INSTALL_RPATH "${TILELANG_INSTALL_RPATH}") + set_target_properties(${target} PROPERTIES + LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ARCHIVE_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" + ) +endforeach() + +# Strip backend runtime dependencies for portable wheels +if(DEFINED TILELANG_ACTIVE_BACKEND_PATCHELF_REMOVE AND PATCHELF_EXECUTABLE) + foreach(_needed IN LISTS TILELANG_ACTIVE_BACKEND_PATCHELF_REMOVE) + set(_patchelf_remove_args "${_patchelf_remove_args} --remove-needed ${_needed}") + endforeach() + foreach(target IN LISTS TILELANG_OUTPUT_TARGETS) + install(CODE " + execute_process( + COMMAND ${PATCHELF_EXECUTABLE}${_patchelf_remove_args} + \"$\" + WORKING_DIRECTORY \"${CMAKE_INSTALL_PREFIX}\" + RESULT_VARIABLE patchelf_result + ) + if(patchelf_result EQUAL 0) + message(STATUS \"patchelf: removed dependencies from $\") + else() + message(WARNING \"patchelf failed for $\") + endif() + ") + endforeach() +endif() install( - TARGETS tvm tvm_runtime tilelang_module tilelang + 
TARGETS ${TILELANG_OUTPUT_TARGETS} LIBRARY DESTINATION tilelang/lib + RUNTIME DESTINATION tilelang/lib + ARCHIVE DESTINATION tilelang/lib ) - -# Build tilescale_ext PyTorch C++ extension -if(USE_CUDA) - # Find Torch - execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import torch; print(torch.utils.cmake_prefix_path)" - OUTPUT_VARIABLE TORCH_CMAKE_PREFIX_PATH - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE TORCH_CMAKE_RESULT - ) - if(TORCH_CMAKE_RESULT EQUAL 0 AND EXISTS "${TORCH_CMAKE_PREFIX_PATH}") - list(APPEND CMAKE_PREFIX_PATH "${TORCH_CMAKE_PREFIX_PATH}") - endif() - - find_package(Torch QUIET) - if(Torch_FOUND) - message(STATUS "Building tilescale_ext with Torch ${Torch_VERSION}") - - set(TILESCALE_EXT_SOURCES - ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/ts_ext_bindings.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/tensor.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/tilelang/utils/ts_ext/ipc_ops.cpp - ) - - # Find libtorch_python.so - execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import torch; import os; print(os.path.join(os.path.dirname(torch.__file__), 'lib', 'libtorch_python.so'))" - OUTPUT_VARIABLE TORCH_PYTHON_LIBRARY - OUTPUT_STRIP_TRAILING_WHITESPACE - RESULT_VARIABLE TORCH_PYTHON_RESULT - ) - - python_add_library(tilescale_ext_C MODULE ${TILESCALE_EXT_SOURCES} WITH_SOABI) - target_compile_definitions(tilescale_ext_C PRIVATE TORCH_EXTENSION_NAME=_C) - target_include_directories(tilescale_ext_C PRIVATE - ${TORCH_INCLUDE_DIRS} - ${CUDAToolkit_INCLUDE_DIRS} - ) - - if(TORCH_PYTHON_RESULT EQUAL 0 AND EXISTS "${TORCH_PYTHON_LIBRARY}") - message(STATUS "Found libtorch_python: ${TORCH_PYTHON_LIBRARY}") - target_link_libraries(tilescale_ext_C PRIVATE ${TORCH_LIBRARIES} ${TORCH_PYTHON_LIBRARY} CUDA::cudart) - else() - message(WARNING "libtorch_python.so not found, extension may have undefined symbols") - target_link_libraries(tilescale_ext_C PRIVATE ${TORCH_LIBRARIES} CUDA::cudart) - endif() - - target_compile_options(tilescale_ext_C PRIVATE -fPIC) - set_target_properties(tilescale_ext_C PROPERTIES - OUTPUT_NAME "_C" - CXX_STANDARD 17 - LIBRARY_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/lib" - ) - - # Install as tilescale_ext/_C.so so it can be imported as tilescale_ext._C - install(TARGETS tilescale_ext_C - LIBRARY DESTINATION tilescale_ext - RUNTIME DESTINATION tilescale_ext) - else() - message(WARNING "Torch not found, tilescale_ext will not be built") - endif() -endif() diff --git a/VERSION b/VERSION index e52aba075b..97c7127d8d 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.1.7.post1 +0.1.9.post1 diff --git a/cmake/FindPipCUDAToolkit.cmake b/cmake/FindPipCUDAToolkit.cmake new file mode 100644 index 0000000000..29cb3f3642 --- /dev/null +++ b/cmake/FindPipCUDAToolkit.cmake @@ -0,0 +1,70 @@ +# FindPipCUDAToolkit.cmake +# +# Locate CUDA toolkit — first trying the host system, then falling back +# to pip-installed packages (nvidia-cuda-nvcc, nvidia-cuda-cccl). +# +# This module should be included BEFORE project() to set CMAKE_CUDA_COMPILER +# when pip CUDA is used. +# +# Detection order: +# 1. Try find_package(CUDAToolkit QUIET) — succeeds if a host CUDA +# installation is available; skip pip detection. +# 2. If env var WITH_PIP_CUDA_TOOLCHAIN is set to a path (e.g., .../cu13), +# use that directory directly as the CUDA toolkit root. +# 3. Otherwise, try auto-detecting from the current Python environment's +# site-packages (works with --no-build-isolation). 
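+# +# Example invocation (hypothetical path; assumes a pip-provided toolchain laid +# out under site-packages/nvidia/cu13, e.g. from the nvidia-cuda-nvcc wheel): +#   WITH_PIP_CUDA_TOOLCHAIN=/path/to/site-packages/nvidia/cu13 cmake -S . -B build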
+ +# --- Try host CUDA first --- +find_package(CUDAToolkit QUIET) +if(CUDAToolkit_FOUND) + return() +endif() + +find_program(_PIP_CUDA_PYTHON_EXE NAMES python3 python) +if(NOT _PIP_CUDA_PYTHON_EXE) + return() +endif() + +# --- Strategy 1: explicit path via env var --- +if(DEFINED ENV{WITH_PIP_CUDA_TOOLCHAIN}) + set(_PIP_CUDA_ROOT "$ENV{WITH_PIP_CUDA_TOOLCHAIN}") + if(NOT EXISTS "${_PIP_CUDA_ROOT}/bin/nvcc") + message(FATAL_ERROR + "FindPipCUDAToolkit: WITH_PIP_CUDA_TOOLCHAIN is set to '${_PIP_CUDA_ROOT}' " + "but nvcc was not found at '${_PIP_CUDA_ROOT}/bin/nvcc'") + endif() + # Prepare the directory (create lib64 symlink, unversioned .so symlinks, + # libcuda.so stub) that CMake / nvcc expect but pip packages omit. + execute_process( + COMMAND "${_PIP_CUDA_PYTHON_EXE}" "${CMAKE_CURRENT_LIST_DIR}/find_pip_cuda.py" + "${_PIP_CUDA_ROOT}" + OUTPUT_QUIET + ) + message(STATUS "FindPipCUDAToolkit: using env WITH_PIP_CUDA_TOOLCHAIN=${_PIP_CUDA_ROOT}") +else() + # --- Strategy 2: auto-detect from current Python env --- + execute_process( + COMMAND "${_PIP_CUDA_PYTHON_EXE}" "${CMAKE_CURRENT_LIST_DIR}/find_pip_cuda.py" + OUTPUT_VARIABLE _PIP_CUDA_OUTPUT + OUTPUT_STRIP_TRAILING_WHITESPACE + RESULT_VARIABLE _PIP_CUDA_RESULT + ) + + if(NOT _PIP_CUDA_RESULT EQUAL 0) + message(STATUS "FindPipCUDAToolkit: pip-installed CUDA toolkit not found") + return() + endif() + + string(JSON _PIP_CUDA_ROOT GET "${_PIP_CUDA_OUTPUT}" "root") + message(STATUS "FindPipCUDAToolkit: auto-detected from Python environment") +endif() + +# --- Common pip-CUDA setup --- +set(CMAKE_CUDA_COMPILER "${_PIP_CUDA_ROOT}/bin/nvcc" CACHE FILEPATH "CUDA compiler (from pip)" FORCE) +set(CUDAToolkit_ROOT "${_PIP_CUDA_ROOT}" CACHE PATH "CUDA toolkit root (from pip)" FORCE) + +list(APPEND CMAKE_LIBRARY_PATH "${_PIP_CUDA_ROOT}/lib/stubs" "${_PIP_CUDA_ROOT}/lib") + +message(STATUS "FindPipCUDAToolkit: using pip-installed CUDA toolkit") +message(STATUS " nvcc: ${CMAKE_CUDA_COMPILER}") +message(STATUS " root: ${CUDAToolkit_ROOT}") diff --git a/cmake/find_pip_cuda.py b/cmake/find_pip_cuda.py new file mode 100644 index 0000000000..2bbf6b890a --- /dev/null +++ b/cmake/find_pip_cuda.py @@ -0,0 +1,103 @@ +"""Locate pip-installed CUDA toolkit and prepare it for CMake consumption. + +Used by cmake/FindPipCUDAToolkit.cmake via ``execute_process``. +Outputs a JSON object with paths on success, exits with code 1 on failure. 
+ +Usage: + python find_pip_cuda.py # auto-detect from current env + python find_pip_cuda.py /path/to/cu13 # use explicit path, just prepare it +""" + +import contextlib +import json +import pathlib +import subprocess +import sys + + +def _find_cu_dir(): + """Find the nvidia/cu directory from the nvidia pip package.""" + try: + import nvidia + except ImportError: + return None + + nvidia_dir = pathlib.Path(nvidia.__path__[0]) + cu_dirs = sorted( + (d for d in nvidia_dir.iterdir() if d.name[:2] == "cu" and d.name[2:].isdigit()), + key=lambda d: int(d.name[2:]), + ) + if not cu_dirs: + return None + cu_dir = cu_dirs[-1] + if (cu_dir / "bin" / "nvcc").is_file(): + return cu_dir + return None + + +def _ensure_lib_symlinks(cu_dir): + """Create symlinks that CMake / nvcc expect but pip packages omit.""" + lib_dir = cu_dir / "lib" + if not lib_dir.is_dir(): + return + + # nvcc expects lib64/ on 64-bit + lib64 = cu_dir / "lib64" + if not lib64.exists(): + with contextlib.suppress(OSError): + lib64.symlink_to("lib") + + # CMake expects unversioned .so (e.g., libcudart.so) + for so in lib_dir.glob("*.so.*"): + base = lib_dir / (so.name.split(".so.")[0] + ".so") + if not base.exists(): + with contextlib.suppress(OSError): + base.symlink_to(so.name) + + +def _ensure_cuda_stub(cu_dir): + """Create a minimal libcuda.so stub for build-time -lcuda linking.""" + stubs_dir = cu_dir / "lib" / "stubs" + stub = stubs_dir / "libcuda.so" + if stub.exists(): + return + stubs_dir.mkdir(parents=True, exist_ok=True) + src = stubs_dir / "_stub.c" + try: + src.write_text("void cuGetErrorString(void){}\n") + subprocess.check_call( + ["gcc", "-shared", "-o", str(stub), str(src)], + stderr=subprocess.DEVNULL, + ) + except Exception: + pass + finally: + src.unlink(missing_ok=True) + + +def main(): + if len(sys.argv) > 1: + # Explicit path provided — just prepare it + cu_dir = pathlib.Path(sys.argv[1]) + else: + # Auto-detect from current Python environment + cu_dir = _find_cu_dir() + + if cu_dir is None or not (cu_dir / "bin" / "nvcc").is_file(): + sys.exit(1) + + _ensure_lib_symlinks(cu_dir) + _ensure_cuda_stub(cu_dir) + + print( + json.dumps( + { + "nvcc": str(cu_dir / "bin" / "nvcc"), + "root": str(cu_dir), + } + ) + ) + + +if __name__ == "__main__": + main() diff --git a/docs/_static/img/ir_transform_diagram.png b/docs/_static/img/ir_transform_diagram.png index 3bd8689139..f6cbc9da4a 100644 Binary files a/docs/_static/img/ir_transform_diagram.png and b/docs/_static/img/ir_transform_diagram.png differ diff --git a/docs/deeplearning_operators/gemv.md b/docs/deeplearning_operators/gemv.md index 38287f2205..c2dddf47fe 100644 --- a/docs/deeplearning_operators/gemv.md +++ b/docs/deeplearning_operators/gemv.md @@ -292,7 +292,7 @@ def splitk_gemv_vectorized_tvm( C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): @@ -377,7 +377,7 @@ def get_best_config(N, K): C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype) C_reduced = T.alloc_local((1,), accum_dtype) with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): diff --git a/docs/merge_upstream_tilelang.md 
b/docs/merge_upstream_tilelang.md index 48a1ec9a65..c702585027 100644 --- a/docs/merge_upstream_tilelang.md +++ b/docs/merge_upstream_tilelang.md @@ -344,6 +344,135 @@ PR [#50](https://github.com/tile-ai/tilescale/pull/50) ("Sync mainstream TileLan --- +## 10. Practical Lessons from PR #58 + +This section captures the hard-won lessons from the `0.1.7.post1 → 0.1.9` sync (~50 upstream commits, ~80K LOC diff). Unlike PR #50, which cherry-picked individual commits, PR #58 merged the entire upstream delta in one operation — a much more aggressive approach that revealed systematic failure modes. + +### 10.1 `src/transform/` and `src/tl_templates/` Are NOT All TileScale-Exclusive + +Section 2.3 classifies these directories as "TileScale-specific pass infrastructure" and says "never overwrite". This is **misleading for bulk operations**. In practice: + +- **Most files in `src/transform/` exist in both repos** (e.g., `layout_inference.cc`, `loop_partition.cc`, `lower_tile_op.cc`, `inject_pipeline.cc`). They originated from upstream and were modified by both sides. +- **Only a small subset are truly TileScale-only**: `lower_cpengine_intrin.cc`, `storage_access.cc/h`, `wgmma_sync_rewriter.cc`, `align_dynamic_shared_memory_allocations.cc`, `inject_ptx_async_copy.cc`, `inject_fence_proxy.cc`. +- **Upstream also adds new transforms** (e.g., `producer_consumer_ws.cc`, `unroll_loop.cc`, `verify_parallel_loop.cc`, `fuse_mbarrier_arrive_expect_tx.cc`) that TileScale should absorb. +- **The rule**: For each file, check whether it exists in the upstream commit (`git cat-file -e <upstream-sha>:<path>`). If it does, `--theirs` (take upstream) is the safe default. Only keep `--ours` for files that are genuinely TileScale-only. + +### 10.2 `git merge` vs `git cherry-pick` + +- **`git merge` (one-shot)**: Fast but produces ~400 conflicted files. Resolution must be done programmatically (batch `--ours`/`--theirs`). The batch resolution can silently corrupt files that need manual adaptation. +- **`git cherry-pick` (per-commit)**: Safer, more auditable, but slow for 50+ commits. +- **For >20 commits**: Consider `git merge --no-commit`, then resolve conflicts with the file-by-file decision table below, then `git commit`. + +### 10.3 Mandatory Build-Import-Run Loop + +After conflict resolution, the merge is **never** clean on the first try.
Follow this loop until both `import tilelang` and a distributed example pass: + +```bash +ninja -C build 2>&1 | grep error: # fix C++ build errors +python -c "import tilelang" # fix Python import errors +python examples/distributed/example_xxx.py # fix runtime errors +``` + +Common failure categories and their symptoms: + +| Symptom | Root Cause | Fix | +|---------|-----------|-----| +| `undefined symbol: _ZN3tvm2tl31ApplyMultiVersionBufferRewriterE...` | Stale TileScale `.cc` kept as `--ours`; upstream added function to this file | `git checkout <upstream-sha> -- <file>` | +| `no matching function for call to 'VectorizeLoop(..., LayoutMap&)'` | Upstream removed/renamed an overload | Check upstream `loop_vectorize.h` for new signatures; adjust callers | +| `'create_list_of_mbarrier' was not declared` / `'get_mbarrier' was not declared` | TileScale ops registered in old `builtin.cc`; removed in upstream | Add them back to `builtin.cc` and `builtin.h` | +| `error: 'LoopPragmaUnroll' was not declared` | Upstream renamed to `PragmaUnrollLoop` | Bulk rename | +| `error: 'atomicadd_elem_op' was not declared; did you mean 'atomic_add_elem_op'?` | Upstream added underscore | Bulk rename | +| `Module has no function '__tilescale_init_table'` | Upstream `rt_mod_cuda.cc` uses `CUDAModuleCreate` instead of `TileScaleCUDAModuleCreate` | Restore `TileScaleCUDAModuleCreate` calls + include in `rt_mod_cuda.cc` | +| `'JITKernel' object has no attribute 'initialize'` | Upstream `jit/kernel.py` doesn't have TileScale's `initialize()` | Add back `initialize()` method + `allocator` attribute | +| `TVMFFIKernelAdapter has no attribute 'init_table'` | Upstream adapter doesn't have `init_table()` | Add back `init_table()` to `tilelang/jit/adapter/tvm_ffi.py` | +| `'lazy_jit' not found in tilelang.jit` | Upstream `jit/__init__.py` doesn't export `lazy_jit` | Remove from `__init__.py` import, or add back implementation | + +### 10.4 The Distributed Codegen Must Be Surgically Preserved + +TileScale adds significant infrastructure to CUDA codegen that upstream knows nothing about. After merging upstream codegen files, the following **must** be present: + +**In `src/target/codegen_cuda.h`**: +```cpp +static inline bool use_distributed() { + const char *env = std::getenv("TILELANG_USE_DISTRIBUTED"); + if (env) return std::string(env) == "1"; + return false; +} +// Inside class CodeGenTileLangCUDA: +bool use_distributed_{use_distributed()}; +bool need_multimem_h_{false}; +``` + +**In `src/target/codegen_cuda.cc`**: +```cpp +#include "../op/distributed.h" +#include "../op/sync.h" + +// Inside Finish(): +if (use_distributed_) { + decl_stream << "#include <tl_templates/cuda/sync.h>\n"; + decl_stream << "#include <tl_templates/cuda/ldst.h>\n"; + decl_stream << "#include <tl_templates/cuda/distributed.h>\n"; + decl_stream << "extern \"C\" __constant__ uint64_t meta_data[1024];\n"; +} +if (need_multimem_h_) { + decl_stream << "#include <tl_templates/cuda/multimem.h>\n"; +} +``` + +**In `src/target/rt_mod_cuda.cc`**: Replace upstream `CUDAModuleCreate` with `TileScaleCUDAModuleCreate` and add `#include "../runtime/tilescale_cuda_module.h"`.
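+ +A quick post-merge check (a sketch; run from the repo root, with the file and symbol names taken from the tables above) that the TileScale module factory survived the merge: + +```bash +grep -n -e TileScaleCUDAModuleCreate -e tilescale_cuda_module.h src/target/rt_mod_cuda.cc +``` + +Both patterns should match; if neither does, restore the file per section 10.8.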
+ +### 10.5 TileScale-Specific Python Utilities That Pass Silently + +These files exist only in TileScale and were not overwritten by the merge, but their callers in shared modules may have changed: + +| File | TileScale Purpose | What Can Break | +|------|-------------------|----------------| +| `tilelang/utils/allocator.py` | `BaseAllocator`, `get_allocator()` | `torch.set_default_device` conflicts with `all_gather_object`; `parse_device` must be correct | +| `tilelang/utils/tensor.py` (the `tensor()` function) | `tilelang.tensor(...)` factory | Lost `tensor()` function if upstream file overwrites it | +| `tilelang/utils/target.py` (the `parse_device()` function) | device string parsing for allocator | `parse_device("cuda")` returning hardcoded 0 instead of `current_device()` | +| `tilelang/distributed/utils.py` | `init_dist()`, `perf_fn()` | `torch.set_default_device("cuda")` before `init_process_group` causes NCCL device mismatch | + +### 10.6 The Device Mismatch Trap + +When `init_dist()` calls `torch.set_default_device("cuda")` (without device index), all PyTorch tensors default to `cuda:0`. With newer PyTorch (2.2+) passing `device_id` to `init_process_group`, NCCL enforces that collective tensors match the process group's device. This causes: + +``` +torch.distributed.DistBackendError: Tensor found on device cuda:0 but backend constrained to cuda:1 +``` + +**Fix**: Call `torch.cuda.set_device(local_rank)` BEFORE `init_process_group`, and use explicit device strings. Also ensure `parse_device("cuda")` returns `torch.cuda.current_device()` rather than hardcoded `0`. + +### 10.7 Duplicate Op Registration Detection + +After a large merge, upstream may have added op registrations that TileScale's files also register. Check with: + +```bash +python -c "import tilelang" 2>&1 | grep "already registered" +``` + +If you see `Global Function 'tl.X' is already registered`, search for duplicate `refl::GlobalDef().def("tl.X", ...)` registrations and remove the TileScale copy (keep the upstream one). + +### 10.8 After Merge: Restore Truly TileScale-Only Files from Old Main + +After batch resolution, verify these files match the pre-sync TileScale version: + +| Category | Key Files | +|----------|-----------| +| Distributed C++ ops | `src/op/distributed.cc/h`, `src/op/remote_copy.cc/h`, `src/op/sync.cc/h`, `src/op/multimem.cc/h`, `src/op/multimem_rewriter.h`, `src/op/gemm_py.cc/h` | +| Distributed runtime | `src/runtime/tilescale_cuda_module.cc/h`, `src/shared_memory/shared_memory.cc` | +| Distributed templates | `src/tl_templates/cuda/distributed.h`, `sync.h`, `ldst.h`, `multimem.h` | +| TileScale transforms | `lower_cpengine_intrin.cc`, `storage_access.cc/h`, `wgmma_sync_rewriter.cc`, `align_dynamic_shared_memory_allocations.cc`, `inject_ptx_async_copy.cc`, `inject_fence_proxy.cc` | +| Python distributed | `tilelang/distributed/**`, `tilelang/language/distributed/**`, `tilelang/utils/allocator.py` | +| Build config | `src/backend/cuda/CMakeLists.txt` (must include `tilescale_cuda_module.cc` and `shared_memory/shared_memory.cc`) | + +```bash +# Restore a known-good TileScale file +git show main:<path> > <path> +``` + +--- + ## 9.
Checklist for Each Sync PR Before opening the PR: @@ -353,8 +482,13 @@ Before opening the PR: - [ ] `CMakeLists.txt` conflict resolved; `tilescale_ext` target intact - [ ] `tilelang/__init__.py` still exports distributed namespace - [ ] Full build passes +- [ ] `import tilelang` succeeds with no import errors +- [ ] `tilelang.distributed` imports successfully - [ ] Shared `testing/python/` tests pass - [ ] At least one distributed example runs end-to-end +- [ ] `TileScaleCUDAModuleCreate` used in `rt_mod_cuda.cc` (not `CUDAModuleCreate`) +- [ ] Distributed template includes present in `codegen_cuda.cc` (`sync.h`, `ldst.h`, `distributed.h`, `multimem.h`, `meta_data`) +- [ ] No duplicate TVM FFI registrations (`python -c "import tilelang"` clean) - [ ] API-breaking upstream changes reflected in TileScale distributed layer if applicable - [ ] PR title follows: `[Sync] Merge upstream TileLang <version>` - [ ] PR description lists: last-synced upstream SHA, new upstream SHA, major features included, any skipped items with justification diff --git a/docs/programming_guides/python_compatibility.md b/docs/programming_guides/python_compatibility.md new file mode 100644 index 0000000000..b858e392ab --- /dev/null +++ b/docs/programming_guides/python_compatibility.md @@ -0,0 +1,59 @@ +# Python Compatibility + +TileLang is a Python-embedded DSL, but not all Python syntax is supported inside +the TileLang DSL. This guide clarifies what works, what doesn't, and how +to translate common Python patterns into TileLang equivalents. Specifically, we focus on +kernel-side semantics (code inside `with T.Kernel`). For host-side semantics when +using eager-style JIT, please stay tuned for our upcoming documentation. + +The examples below use the conventional aliases: + +```python +import tilelang +import tilelang.language as T +from tilelang import jit +``` + +## Control Flow & Loops + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `for i in range(n)` | ✅ | Maps to `T.serial(n)` | +| `for i in range(a,b,s)` | ✅ | Maps to `T.serial(a, b, s)` | +| `for x in list` | ❌ | Use index-based loop | +| `while condition` | ✅ | | +| `if` / `elif` / `else` | ✅ | | +| `x if cond else y` | ✅ | Ternary expression | +| `break` / `continue` | ✅ | | +| `enumerate()` / `zip()` | ❌ | | + +## Data Access + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `a[i]` indexing | ✅ | Multi-dim indexing supported: `a[i, j, k]` | +| `a[i:j]` slicing | ✅ | Creates `BufferRegion` | +| `a[-1]` negative index | ✅ | | + +## Assignment & Arithmetic Operations + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `x = expr` | ✅ | | +| `+`, `-`, `*`, `/`, `%` | ✅ | Maps to device-side arithmetic operations | +| `+=`, `-=`, `*=`, etc. | ✅ | Augmented assignment | +| `a = b = c` | ❌ | Use separate assignments | + +## Functions & Classes + +As a kernel script language, TileLang doesn't support defining functions or classes inside kernels. You can use `@T.macro` to define reusable code blocks, which are inlined at compile time like a `__device__` function.
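+ +For example, a minimal sketch (the macro name and its arguments are illustrative, not a fixed API): + +```python +import tilelang.language as T + +@T.macro +def scale_rows(buf, factor, rows, cols): +    # Expanded inline at each call site, like a __device__ function. +    for i, j in T.Parallel(rows, cols): +        buf[i, j] = buf[i, j] * factor +```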
+ +## Statements & Built-in Functions + +| Python Feature | Supported | Notes / Alternative | +|-------------------------|:---------:|------------------------------------------| +| `with` | ⚠️ | Only `T.Kernel`, `T.ws` | +| `assert` | ⚠️ | Use `T.device_assert` or `T.assert` | +| `print()` | ⚠️ | Use `T.print()`; `print` works for Python expressions | +| `len()` | ❌ | Use `buffer.shape[dim]` | +| `type()`, `isinstance()`| ❌ | | diff --git a/docs/runtime_internals/stubs.md b/docs/runtime_internals/stubs.md new file mode 100644 index 0000000000..ee4c628e79 --- /dev/null +++ b/docs/runtime_internals/stubs.md @@ -0,0 +1,47 @@ +# CUDA and ROCm Stub Libraries + +This document describes TileLang's stub mechanism for GPU driver/runtime +libraries (CUDA and ROCm/HIP). + +## Purpose + +CUDA: + +1. **CUDA Driver (`cuda_stub`)**: Allows TileLang to be imported on systems + without a GPU (e.g., CI/compilation nodes) by lazy-loading `libcuda.so` only + when needed. +2. **CUDA Runtime & Compiler (`cudart_stub`, `nvrtc_stub`)**: Resolves SONAME + versioning mismatches (e.g. `libcudart.so.11` vs `libcudart.so.12`), + enabling a single build to work across different CUDA versions. This is + achieved by reusing CUDA libraries already loaded by frameworks like PyTorch + when possible. + +ROCm: + +1. **HIP Runtime/Module API (`hip_stub`)**: Allows TileLang to be imported on + systems without ROCm installed by lazy-loading `libamdhip64.so` only when + needed. The stub also prefers already-loaded symbols via `RTLD_DEFAULT` / + `RTLD_NEXT` to interoperate with frameworks that have already loaded HIP. +2. **HIP Runtime Compiler (`hiprtc_stub`)**: Lazily loads `libhiprtc.so` and + exposes the minimal HIPRTC API subset used by TileLang/TVM. + +## Implementation + +The stubs in `src/target/stubs/` implement a lazy-loading mechanism: + +- **Lazy Loading**: Libraries are loaded via `dlopen` only upon the first API call. +- **Global Symbol Reuse**: For `cudart` and `nvrtc`, the stubs first check the global namespace (`RTLD_DEFAULT`) to use any already loaded symbols (e.g., from PyTorch). +- **ROCm Notes**: `hip_stub` checks `RTLD_DEFAULT` / `RTLD_NEXT` first and then + falls back to `dlopen("libamdhip64.so")`. It additionally provides wrappers + for `hsa_init` / `hsa_shut_down` so that ROCm-enabled wheels do not record a + hard dependency on `libhsa-runtime64` at import time. +- **Versioning Support**: Handles ABI differences between CUDA versions (e.g., `cudaGraphInstantiate` changes in CUDA 12). + +## Build Option + +- `TILELANG_USE_CUDA_STUBS` (Default: `ON`) controls CUDA stubs. When enabled, + TileLang links against these stubs instead of the system CUDA toolkit + libraries. +- `TILELANG_USE_HIP_STUBS` (Default: `ON`) controls ROCm stubs. When enabled + (and `USE_ROCM=ON`), TileLang/TVM link against `hip_stub` / `hiprtc_stub` + instead of the system ROCm libraries. 
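+ +For example, to link directly against the host toolkit libraries instead of the stubs (useful when debugging linkage issues), disable the option at configure time: + +```bash +cmake -S . -B build -DTILELANG_USE_CUDA_STUBS=OFF +```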
diff --git a/docs/spelling_wordlist.txt b/docs/spelling_wordlist.txt index 6fd4334594..2d418f0142 100644 --- a/docs/spelling_wordlist.txt +++ b/docs/spelling_wordlist.txt @@ -1,6 +1,8 @@ cancelled +dout HDA hsa +inouts ist LOD nd diff --git a/examples/amd/example_amd_flash_attn_bwd.py b/examples/amd/example_amd_flash_attn_bwd.py index 788aec367c..0a9b7d26cb 100644 --- a/examples/amd/example_amd_flash_attn_bwd.py +++ b/examples/amd/example_amd_flash_attn_bwd.py @@ -1,3 +1,4 @@ +import sys import torch import torch.nn.functional as F import tilelang @@ -10,6 +11,15 @@ import time +def IsRDNA(): + if torch.cuda.is_available(): + gpu_name = torch.cuda.get_device_name().strip() + return "Radeon" in gpu_name + else: + print("Error: GPU Device is not detected") + sys.exit(1) + + def ref_program(Q, K, V, is_causal, groups=1): assert Q.size(2) == K.size(2) * groups, f"Q heads {Q.size(2)} K heads {K.size(2)} groups {groups}" assert Q.size(2) == V.size(2) * groups, f"Q heads {Q.size(2)} V heads {V.size(2)} groups {groups}" @@ -30,13 +40,24 @@ def ref_program(Q, K, V, is_causal, groups=1): def get_fwd_configs(): - block_M = [32, 64, 128, 256] - block_N = [32, 64, 128, 256] - threads = [128, 256, 512] - num_split_q = [64, 128, 256] - num_stages = [0, 1] + # Match the standalone forward example on RDNA. WMMA configs larger than + # 32x32 can trigger layout issues when bridging the softmax fragment into + # the second GEMM's A-layout. + if IsRDNA(): + block_M = [16, 32, 64] + block_N = [16, 32, 64] + threads = [32, 64] + num_split_q = [16, 32, 64] + num_stages = [0] + k_pack = [1] + else: + block_M = [32, 64, 128, 256] + block_N = [32, 64, 128, 256] + threads = [128, 256, 512] + num_split_q = [64, 128, 256] + num_stages = [0, 1] + k_pack = [2] enable_rasterization = [True] - k_pack = [2] panel_size = [7, 8, 9, 10] qk_coalesced_width = [8] v_coalesced_width = [4] @@ -46,6 +67,8 @@ def get_fwd_configs(): for m, n, s, t, stages, r, k, p, qkw, vw in itertools.product( block_M, block_N, num_split_q, threads, num_stages, enable_rasterization, k_pack, panel_size, qk_coalesced_width, v_coalesced_width ): + if IsRDNA() and m == 16 and n == 16 and t == 64: + continue valid_configs.append( { "block_M": m, @@ -112,7 +135,7 @@ def main( bx_loop_var = T.alloc_var(T.int32) bx_loop_var = b_split - with T.While(bx_loop_var < num_q_blocks): + while bx_loop_var < num_q_blocks: acc_o = T.alloc_fragment([block_M, dim], accum_dtype) m_i = T.alloc_fragment([block_M], accum_dtype) l_i = T.alloc_fragment([block_M], accum_dtype) @@ -127,6 +150,10 @@ def main( Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) + # Bridge the WMMA D-layout softmax fragment into the A-layout + # expected by GEMM 2 on RDNA GPUs. 
+ if IsRDNA(): + P_shared = T.alloc_shared([block_M, block_N], dtype) acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) @@ -188,7 +215,12 @@ def main( for i in T.Parallel(block_M): l_i[i] += row_sum[i] - T.copy(acc_s, acc_s_cast) + if IsRDNA(): + for i, j in T.Parallel(block_M, block_N): + P_shared[i, j] = T.cast(acc_s[i, j], dtype) + T.copy(P_shared, acc_s_cast) + else: + T.copy(acc_s, acc_s_cast) T.gemm(acc_s_cast, V_shared, acc_o, policy=GemmWarpPolicy.FullRow) @@ -211,15 +243,27 @@ def main( def get_bwd_configs(): - block_M = [16, 32, 64, 128, 256] - block_N = [16, 32, 64, 128, 256] - threads = [64, 128, 256, 512, 1024] - num_stages = [0, 1, 2] + # Keep the RDNA search space aligned with the WMMA-friendly tile sizes + # verified above. Larger tiles and some warp/block combinations are either + # unsupported or known to trigger invalid lowering on RDNA. + if IsRDNA(): + block_M = [16, 32] + block_N = [16, 32] + threads = [32, 64] + num_stages = [0] + panel_size = [7, 8] + else: + block_M = [16, 32, 64, 128, 256] + block_N = [16, 32, 64, 128, 256] + threads = [64, 128, 256, 512, 1024] + num_stages = [0, 1, 2] + panel_size = [7, 8, 9, 10] enable_rasterization = [True] - panel_size = [7, 8, 9, 10] configs = [] for m, n, stages, t, r, p in itertools.product(block_M, block_N, num_stages, threads, enable_rasterization, panel_size): + if IsRDNA() and m == 16 and n == 16 and t == 64: + continue configs.append( { "block_M": m, @@ -305,6 +349,10 @@ def flash_bwd_kernel( lse_shared = T.alloc_shared([block_N], accum_dtype) delta_shared = T.alloc_shared([block_N], accum_dtype) ds_shared = T.alloc_shared([block_M, block_N], dtype) + if IsRDNA(): + # Bridge the WMMA D-layout fragment produced by GEMM/elementwise + # ops into the A-layout expected by the following GEMM. 
+ p_shared = T.alloc_shared([block_M, block_N], dtype) p_cast = T.alloc_fragment([block_M, block_N], dtype) qkT = T.alloc_fragment([block_M, block_N], accum_dtype) @@ -343,17 +391,29 @@ def flash_bwd_kernel( T.gemm(V_shared, do_shared, dP, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(P_acc, p_cast) + if IsRDNA(): + for i, j in T.Parallel(block_M, block_N): + p_shared[i, j] = T.cast(P_acc[i, j], dtype) + T.copy(p_shared, p_cast) + else: + T.copy(P_acc, p_cast) T.gemm(p_cast, do_shared, dv, policy=T.GemmWarpPolicy.FullRow) T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta_shared) - for i, j in T.Parallel(block_M, block_N): - p_cast[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale - - T.gemm(p_cast, q_shared, dk, policy=T.GemmWarpPolicy.FullRow) - + if IsRDNA(): + for i, j in T.Parallel(block_M, block_N): + dP[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale + for i, j in T.Parallel(block_M, block_N): + p_shared[i, j] = T.cast(dP[i, j], dtype) + T.copy(p_shared, p_cast) + T.gemm(p_cast, q_shared, dk, policy=T.GemmWarpPolicy.FullRow) + else: + for i, j in T.Parallel(block_M, block_N): + p_cast[i, j] = P_acc[i, j] * (dP[i, j] - delta_shared[j]) * sm_scale + T.gemm(p_cast, q_shared, dk, policy=T.GemmWarpPolicy.FullRow) T.copy(p_cast, ds_shared) + T.clear(dq) T.gemm(ds_shared, K_shared, dq, transpose_A=True) for i, j in T.Parallel(block_N, dim): diff --git a/examples/amd/example_amd_flash_attn_fwd.py b/examples/amd/example_amd_flash_attn_fwd.py index ca9c361ff1..cb7d1225d2 100644 --- a/examples/amd/example_amd_flash_attn_fwd.py +++ b/examples/amd/example_amd_flash_attn_fwd.py @@ -1,3 +1,4 @@ +import sys import torch import torch.nn.functional as F import tilelang @@ -8,7 +9,15 @@ from functools import partial -# Custom supply function to ensure tensors are created on GPU +def IsRDNA(): + if torch.cuda.is_available(): + gpu_name = torch.cuda.get_device_name().strip() + return "Radeon" in gpu_name + else: + print("Error: GPU Device is not detected") + sys.exit(1) + + def supply_tensors_gpu(params): """Supply function that creates tensors on GPU for ROCm/HIP.""" tensors = [] @@ -16,7 +25,9 @@ def supply_tensors_gpu(params): if hasattr(param, "shape") and hasattr(param, "dtype"): # Force creation on GPU device shape = [int(s) for s in param.shape] - tensor = torch.randn(shape, dtype=param.dtype, device="cuda") + # Convert TileLang dtype to PyTorch dtype + torch_dtype = param.dtype.as_torch() + tensor = torch.randn(shape, dtype=torch_dtype, device="cuda") tensors.append(tensor) else: tensors.append(param) @@ -42,14 +53,31 @@ def ref_program(Q, K, V, is_causal, groups=1): def get_configs(): - """Generates configurations for the autotuner, tailored for FA-2 style parallelism.""" - block_M = [32, 64, 128, 256] - block_N = [32, 64, 128, 256] - threads = [128, 256, 512] - num_split_q = [64, 128, 256] - num_stages = [0, 1] + """Generates configurations for the autotuner. + + For RDNA (gfx11xx/gfx12xx) GPUs using WMMA instructions, block sizes + are limited to 32x32 due to a layout mismatch between WMMA D output and A input + registers when block_M > 16 * num_warps_per_32_threads. Larger blocks cause + incorrect results in the shared memory transpose used to convert softmax scores + to the GEMM 2 A-matrix layout. + """ + if IsRDNA(): + block_M = [16, 32] + block_N = [16, 32] + threads = [32, 64] + num_split_q = [16, 32, 64] + num_stages = [0] + # k_pack=2 is broken for RDNA WMMA (incorrect K-dimension loading for multi-k_pack). 
+ # Use k_pack=1 only until fixed. + k_pack = [1] + else: + block_M = [64, 128, 256] + block_N = [64, 128, 256] + threads = [128, 256] + num_split_q = [64, 128, 256] + num_stages = [0, 1] + k_pack = [2] enable_rasterization = [True] - k_pack = [2] panel_size = [7, 8] qk_coalesced_width = [8] v_coalesced_width = [4] @@ -124,7 +152,7 @@ def main( bx = T.alloc_var(T.int32) bx = b_split - with T.While(bx < num_q_blocks): + while bx < num_q_blocks: acc_o = T.alloc_fragment([block_M, dim], accum_dtype) m_i = T.alloc_fragment([block_M], accum_dtype) l_i = T.alloc_fragment([block_M], accum_dtype) @@ -138,6 +166,13 @@ def main( Q_shared = T.alloc_shared([block_M, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) + # P_shared is used to bridge the WMMA D-layout (acc_s output) to + # A-layout (acc_s_cast input for GEMM 2). On RDNA GPUs with WMMA, + # D and A have different register layouts, so a direct fragment-to- + # fragment copy would cause a layout conflict. Routing through shared + # memory correctly transposes the softmax values. + if IsRDNA(): + P_shared = T.alloc_shared([block_M, block_N], dtype) # Use register fragment for P instead of shared memory to reduce LDS usage acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) @@ -184,6 +219,7 @@ def main( for i, j in T.Parallel(block_M, dim): acc_o[i, j] *= scale_factor[i] + # Compute softmax values for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.exp(acc_s[i, j] * scale - m_i[i] * scale) @@ -191,8 +227,20 @@ def main( for i in T.Parallel(block_M): l_i[i] += row_sum[i] - # Cast acc_s (accum_dtype) to dtype in registers and directly GEMM with V - T.copy(acc_s, acc_s_cast) + if IsRDNA(): + # Cast softmax values from f32 (acc_s, D-layout) to f16 (acc_s_cast, A-layout). + # On RDNA with WMMA, D and A have different register layouts. + # Route through shared memory (P_shared) to correctly bridge them: + # 1) T.Parallel writes acc_s values to P_shared at D-layout coordinates. + # 2) T.copy reads P_shared into acc_s_cast at A-layout coordinates. + # This shared-memory transpose is only correct when block_M / threads + # gives at most 2 warps (block_M=32 with 64 threads, or block_M=16 with 32 threads). 
+                for i, j in T.Parallel(block_M, block_N): +                    P_shared[i, j] = T.cast(acc_s[i, j], dtype) +                T.copy(P_shared, acc_s_cast) +            else: +                # This avoids layout conflict between acc_s and acc_s_cast +                T.copy(acc_s, acc_s_cast) T.gemm(acc_s_cast, V_shared, acc_o, policy=GemmWarpPolicy.FullRow) diff --git a/examples/attention_sink/benchmark_gqa_sink_fwd.py b/examples/attention_sink/benchmark_gqa_sink_fwd.py index 211ef1d18c..e0cb5480f3 100644 --- a/examples/attention_sink/benchmark_gqa_sink_fwd.py +++ b/examples/attention_sink/benchmark_gqa_sink_fwd.py @@ -5,7 +5,7 @@ import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor -from example_gqa_sink_fwd_bhsd_wgmma_pipelined import flashattn, ref_program, gen_inputs +from example_gqa_sink_fwd_bhsd import flashattn, ref_program, gen_inputs from typing import Optional diff --git a/examples/attention_sink/benchmark_mha_sink_fwd.py b/examples/attention_sink/benchmark_mha_sink_fwd.py index 50747e6b09..c1e25d9be6 100644 --- a/examples/attention_sink/benchmark_mha_sink_fwd.py +++ b/examples/attention_sink/benchmark_mha_sink_fwd.py @@ -5,7 +5,7 @@ import triton import triton.language as tl from triton.tools.tensor_descriptor import TensorDescriptor -from example_mha_sink_fwd_bhsd_wgmma_pipelined import flashattn, ref_program, gen_inputs +from example_mha_sink_fwd_bhsd import flashattn, ref_program, gen_inputs from typing import Optional diff --git a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py index cfdcd21b58..97a504a0d1 100644 --- a/examples/attention_sink/example_gqa_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_gqa_sink_bwd_bhsd.py @@ -542,7 +542,7 @@ def run_kernel_only(): parser.add_argument("--n_ctx", type=int, default=4096, help="Context size") parser.add_argument("--d_head", type=int, default=128, help="Head dimension") parser.add_argument("--groups", type=int, default=8, help="Groups") - parser.add_argument("--window_size", type=int, default=None, help="window size (default: None, which means full attention)") + parser.add_argument("--window_size", type=int, default=128, help="window size (default: 128)") parser.add_argument("--dtype", type=str, default="float16", help="dtype, can be float16 or bfloat16") args = parser.parse_args() main(args.batch, args.h, args.n_ctx, args.d_head, args.groups, args.window_size, args.dtype) diff --git a/examples/attention_sink/example_gqa_sink_bwd_varlen.py b/examples/attention_sink/example_gqa_sink_bwd_varlen.py new file mode 100644 index 0000000000..64a5a39a86 --- /dev/null +++ b/examples/attention_sink/example_gqa_sink_bwd_varlen.py @@ -0,0 +1,798 @@ +import torch +import tilelang +from tilelang.profiler import do_bench +import tilelang.language as T +import argparse +from typing import Optional +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "../flash_attention")) +from varlen_utils import generate_random_padding_mask, generate_qkv + + +def get_bwd_configs(): +    sm_major, sm_minor = torch.cuda.get_device_capability() +    sm_version = sm_major * 10 + sm_minor +    if sm_version == 80: +        return 64, 32, 1, 128 +    else: +        return 128, 32, 2, 256 + + +@tilelang.jit( +    out_idx=[6, 7], +    pass_configs={ +        tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, +    }, +) +def flashattn_fwd( +    batch_size, +    groups, +    UQ, +    UKV, +    N_CTX, +    heads, +    max_seq_len, +    dim, +    is_causal, +    window_size=None,  # None for full causal attention +    sm_scale=None, +    block_M=64, +
block_N=64, + num_stages=1, + threads=128, + dtype=T.float16, +): + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + o_shape = [UQ, heads, dim] + accum_dtype = T.float32 + + @T.prim_func + def main( + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + Sinks: T.Tensor([heads], dtype), + Output_unpad: T.Tensor(o_shape, dtype), + lse: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + ): + with T.Kernel(T.ceildiv(max_seq_len, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + sinks = T.alloc_fragment([block_M], dtype) + + batch_idx = bz + head_idx = by + kv_head_idx = head_idx // groups + + q_start_idx = cu_seqlens_q[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] + q_end_idx = cu_seqlens_q[batch_idx + 1] + k_end_idx = cu_seqlens_k[batch_idx + 1] + + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) + + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + for i in T.Parallel(block_M): + sinks[i] = Sinks[head_idx] + + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M + + # Determine loop range based on causal mask and sliding window + if is_causal: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + start = 0 + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.ceildiv(kv_current_seqlen, block_N) + else: + start = 0 + end = T.ceildiv(kv_current_seqlen, block_N) + + loop_range = end - start + + for k in T.Pipelined(loop_range, num_stages=num_stages): + actual_k = k + start + T.copy(K_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], K_shared) + + # Build mask considering causal, sliding window, and padding + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx < k_idx) + or (q_idx >= k_idx + window_size) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + 
-T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < actual_k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx >= k_idx + window_size) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(V_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], V_shared) + T.copy(scores_max, scores_max_prev) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + # Handle case where scores_max is -inf (query sees no keys due to causal mask or sliding window) + # This can happen when q_len > k_len (offset < 0) in causal attention, or with sliding window + for i in T.Parallel(block_M): + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + + T.copy(acc_s, acc_s_cast) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + + # Attention sink: add sink contribution to logsum + for i in T.Parallel(block_M): + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + + for i, d in T.Parallel(block_M, dim): + if bx * block_M + i < q_current_seqlen: + Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = acc_o[i, d] + + for i in T.Parallel(block_M): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + for i in T.Parallel(block_M): + if bx * block_M + i < q_current_seqlen: + lse[bz, head_idx, bx * block_M + i] = logsum[i] + + return main + + +@tilelang.jit( + out_idx=[3], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_bwd_preprocess(batch_size, heads, UQ, N_CTX, max_seq_len, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [UQ, heads, dim] + blk = 32 + + @T.prim_func + def flash_bwd_prep( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + Delta: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, blk), batch_size) as (bx, by, bz): + o = T.alloc_fragment([blk, blk], dtype) + do = T.alloc_fragment([blk, blk], dtype) + acc = T.alloc_fragment([blk, blk], accum_dtype) + delta = T.alloc_fragment([blk], accum_dtype) + + 
q_start_idx = cu_seqlens_q[bz] + q_end_idx = cu_seqlens_q[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + + T.clear(acc) + for k in range(T.ceildiv(dim, blk)): + for i, j in T.Parallel(blk, blk): + if by * blk + i < q_current_seqlen and k * blk + j < dim: + o[i, j] = O[q_start_idx + by * blk + i, bx, k * blk + j] + do[i, j] = dO[q_start_idx + by * blk + i, bx, k * blk + j] + else: + o[i, j] = 0.0 + do[i, j] = 0.0 + for i, j in T.Parallel(blk, blk): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + + for i in T.Parallel(blk): + if by * blk + i < q_current_seqlen: + Delta[bz, bx, by * blk + i] = delta[i] + + return flash_bwd_prep + + +def make_dq_layout(dQ): + # Reorder dq for atomic add: [seq, head, dim] -> permuted layout + return T.Layout(dQ.shape, lambda l, h, d: [h, l, d]) + + +@tilelang.jit( + out_idx=[1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_bwd_postprocess(UQ, heads, dim, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [UQ, heads, dim] + blk = 64 + + @T.prim_func + def flash_bwd_post( + dQ: T.Tensor(shape, accum_dtype), + dQ_out: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(UQ, blk), heads, threads=128) as (bx, by): + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy( + dQ[bx * blk : (bx + 1) * blk, by, :], + dQ_out[bx * blk : (bx + 1) * blk, by, :], + ) + + return flash_bwd_post + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + } +) +def flashattn_bwd( + batch_size, + groups, + UQ, + UKV, + N_CTX, + heads, + max_seq_len, + dim, + is_causal, + window_size=None, + sm_scale=None, + dtype=T.float16, +): + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + accum_dtype = T.float32 + + block_M, block_N, num_stages, threads = get_bwd_configs() + + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + @T.prim_func + def flash_bwd( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + Delta: T.Tensor([batch_size, heads, N_CTX], accum_dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, block_M), batch_size, threads=threads) as (bx, by, bz): + K_shared = T.alloc_shared([block_M, dim], dtype) + dsT_shared = T.alloc_shared([block_M, block_N], dtype) + q = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_M, dim], dtype) + qkT = T.alloc_fragment([block_M, block_N], accum_dtype) + dsT = T.alloc_fragment([block_M, block_N], accum_dtype) + qkT_cast = T.alloc_fragment([block_M, block_N], dtype) + dsT_cast = T.alloc_fragment([block_M, block_N], dtype) + lse_shared = T.alloc_shared([block_N], accum_dtype) + delta = T.alloc_shared([block_N], accum_dtype) + do = T.alloc_shared([block_N, dim], dtype) + dv = T.alloc_fragment([block_M, dim], accum_dtype) + dk = T.alloc_fragment([block_M, dim], accum_dtype) + dq = T.alloc_fragment([block_N, dim], accum_dtype) + dv_shared = T.alloc_shared([block_M, dim], accum_dtype) + dk_shared = T.alloc_shared([block_M, dim], accum_dtype) + + q_start_idx = 
cu_seqlens_q[bz] + kv_start_idx = cu_seqlens_k[bz] + q_end_idx = cu_seqlens_q[bz + 1] + k_end_idx = cu_seqlens_k[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.annotate_layout( + { + dQ: make_dq_layout(dQ), + } + ) + T.copy(K[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) + T.clear(dv) + T.clear(dk) + + # For varlen causal attention, we need to account for offset between q and kv lengths + # In forward: Q at pos q can see KV at pos k if q + offset >= k (where offset = kv_len - q_len) + # In backward: KV at pos kv_pos is seen by Q at pos q_pos if kv_pos <= q_pos + offset + offset = kv_current_seqlen - q_current_seqlen + + # loop_st: first Q block that can see this KV block + # kv_pos <= q_pos + offset => by * block_M <= k * block_N + offset + # => k >= (by * block_M - offset) / block_N + loop_st = T.max(0, T.floordiv(by * block_M - offset, block_N)) if is_causal else 0 + loop_ed = ( + T.min(T.ceildiv((by + 1) * block_M - offset + window_size, block_N), T.ceildiv(q_current_seqlen, block_N)) + if window_size is not None + else T.ceildiv(q_current_seqlen, block_N) + ) + + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[q_start_idx + k * block_N : q_start_idx + (k + 1) * block_N, bx, :], q) + T.clear(qkT) + T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + # Causal: kv_pos <= q_pos + offset + # Sliding window: kv_pos > q_pos + offset - window_size + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k * block_N + j + offset) + and (by * block_M + i > k * block_N + j + offset - window_size) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + # Causal: kv_pos <= q_pos + offset + qkT[i, j] = T.if_then_else( + (by * block_M + i <= k * block_N + j + offset) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else( + (by * block_M + i > k * block_N + j + offset - window_size) + and (by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen), + qkT[i, j], + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else( + by * block_M + i < kv_current_seqlen and k * block_N + j < q_current_seqlen, + qkT[i, j], + 0, + ) + + T.copy(dO[q_start_idx + k * block_N : q_start_idx + (k + 1) * block_N, bx, :], dst=do) + T.clear(dsT) + T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(qkT, qkT_cast) + T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) + + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) + + for i, j in T.Parallel(block_M, block_N): + dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) + + T.copy(dsT_cast, dsT_shared) + T.clear(dq) + T.gemm(dsT_shared, K_shared, dq, transpose_A=True) + T.atomic_add(dQ[q_start_idx + k * block_N : q_start_idx + (k + 1) * 
block_N, bx, :], dq) + + T.copy(dv, dv_shared) + T.atomic_add(dV[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], dv_shared) + T.copy(dk, dk_shared) + T.atomic_add(dK[kv_start_idx + by * block_M : kv_start_idx + (by + 1) * block_M, bx // groups, :], dk_shared) + + return flash_bwd + + +@tilelang.jit(out_idx=-1) +def flashattn_bwd_dsink(batch_size, heads, N_CTX, max_seq_len, block=256, dtype: T.dtype = T.float16): + accum_dtype = T.float32 + shape = [batch_size, heads, N_CTX] + + @T.prim_func + def flash_bwd_dsink( + Sinks: T.Tensor([heads], dtype), + Delta: T.Tensor(shape, accum_dtype), + lse: T.Tensor(shape, accum_dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + dsinks: T.Tensor(shape, dtype), + ): + with T.Kernel(heads, T.ceildiv(max_seq_len, block), batch_size, threads=256) as (bx, by, bz): + lse_fragment = T.alloc_fragment([block], accum_dtype) + delta_fragment = T.alloc_fragment([block], accum_dtype) + dsink_fragment = T.alloc_fragment([block], dtype) + + # Get actual sequence length for this batch item + q_start_idx = cu_seqlens_q[bz] + q_end_idx = cu_seqlens_q[bz + 1] + q_current_seqlen = q_end_idx - q_start_idx + + sink = Sinks[bx] + T.copy(lse[bz, bx, by * block : (by + 1) * block], lse_fragment) + T.copy(Delta[bz, bx, by * block : (by + 1) * block], delta_fragment) + for i in T.Parallel(block): + # Only compute for valid positions, set 0 for positions beyond sequence length + dsink_fragment[i] = T.if_then_else( + by * block + i < q_current_seqlen, + -T.exp2(sink * 1.44269504 - lse_fragment[i]) * delta_fragment[i], + 0, + ) + T.copy(dsink_fragment, dsinks[bz, bx, by * block : (by + 1) * block]) + + return flash_bwd_dsink + + +class _attention(torch.autograd.Function): + @staticmethod + def forward( + ctx, q_unpad, k_unpad, v_unpad, sinks, cu_seqlens_q, cu_seqlens_k, N_CTX, max_seqlen_q, max_seqlen_k, window_size, groups, is_causal + ): + def maybe_contiguous(x): + if x.stride(-1) != 1: + return x.contiguous() + return x + + q_unpad, k_unpad, v_unpad, sinks = [maybe_contiguous(x) for x in (q_unpad, k_unpad, v_unpad, sinks)] + UQ, H, D_HEAD = q_unpad.shape + UKV = k_unpad.shape[0] + batch_size = cu_seqlens_q.shape[0] - 1 + dtype = T.float16 if q_unpad.dtype == torch.float16 else T.bfloat16 + + kernel = flashattn_fwd( + batch_size, + groups, + UQ, + UKV, + N_CTX, + H, + max_seqlen_q, + D_HEAD, + is_causal, + window_size=window_size, + block_M=64, + block_N=64, + num_stages=1, + threads=128, + dtype=dtype, + ) + o_unpad, lse = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, sinks) + + ctx.save_for_backward(q_unpad, k_unpad, v_unpad, sinks, o_unpad, lse, cu_seqlens_q, cu_seqlens_k) + ctx.window_size = window_size + ctx.groups = groups + ctx.is_causal = is_causal + ctx.N_CTX = N_CTX + ctx.max_seqlen_q = max_seqlen_q + ctx.max_seqlen_k = max_seqlen_k + ctx.batch_size = batch_size + return o_unpad + + @staticmethod + def backward(ctx, do): + q_unpad, k_unpad, v_unpad, sinks, o_unpad, lse, cu_seqlens_q, cu_seqlens_k = ctx.saved_tensors + UQ, H, D_HEAD = q_unpad.shape + UKV = k_unpad.shape[0] + groups = ctx.groups + batch_size = ctx.batch_size + dtype = T.float16 if q_unpad.dtype == torch.float16 else T.bfloat16 + + kernel_prep = flashattn_bwd_preprocess(batch_size, H, UQ, ctx.N_CTX, ctx.max_seqlen_q, D_HEAD, dtype=dtype) + kernel_post = flashattn_bwd_postprocess(UQ, H, D_HEAD, dtype=dtype) + delta = kernel_prep(o_unpad, do, cu_seqlens_q) + + kernel = flashattn_bwd( + batch_size, + groups, + UQ, + UKV, + ctx.N_CTX, + H, + 
ctx.max_seqlen_q, + D_HEAD, + ctx.is_causal, + window_size=ctx.window_size, + dtype=dtype, + ) + + head_kv = H // groups + dq = torch.zeros_like(q_unpad, dtype=torch.float32) + dk = torch.zeros([UKV, head_kv, D_HEAD], dtype=torch.float32, device=q_unpad.device) + dv = torch.zeros([UKV, head_kv, D_HEAD], dtype=torch.float32, device=q_unpad.device) + + kernel(q_unpad, k_unpad, v_unpad, do, lse, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) + dq = kernel_post(dq) + dk = dk.to(q_unpad.dtype) + dv = dv.to(q_unpad.dtype) + + kernel_dsink = flashattn_bwd_dsink(batch_size, H, ctx.N_CTX, ctx.max_seqlen_q, dtype=dtype) + dsinks = kernel_dsink(sinks, delta, lse, cu_seqlens_q).sum(0).sum(1) + + return dq, dk, dv, dsinks, None, None, None, None, None, None, None, None + + +attention = _attention.apply + + +def ref_program( + q_unpad: torch.Tensor, + k_unpad: torch.Tensor, + v_unpad: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + sinks: torch.Tensor, + batch_size: int, + is_causal: bool, + sliding_window: Optional[int] = None, + groups: int = 1, +) -> torch.Tensor: + """Reference implementation for varlen attention with sinks.""" + total_q, num_heads, head_dim = q_unpad.shape + _, num_key_value_heads, _ = k_unpad.shape + + sm_scale = 1.0 / head_dim**0.5 + + output = torch.zeros_like(q_unpad) + + for b in range(batch_size): + q_start = cu_seqlens_q[b].item() + q_end = cu_seqlens_q[b + 1].item() + k_start = cu_seqlens_k[b].item() + k_end = cu_seqlens_k[b + 1].item() + + q_len = q_end - q_start + k_len = k_end - k_start + + if q_len == 0: + continue + + q_seq = q_unpad[q_start:q_end] # [q_len, heads, dim] + k_seq = k_unpad[k_start:k_end] # [k_len, head_kv, dim] + v_seq = v_unpad[k_start:k_end] # [k_len, head_kv, dim] + + # Reshape for GQA + q_seq = q_seq.view(q_len, num_key_value_heads, groups, head_dim) + sinks_expanded = sinks.view(num_key_value_heads, groups, 1, 1).float() + + k_seq = k_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + v_seq = v_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + + logits = torch.einsum("qhgd,khgd->hgqk", q_seq.float(), k_seq.float()) * sm_scale + + start_q = k_len - q_len + pos_keys = torch.arange(k_len, device=q_unpad.device) + pos_queries = torch.arange(q_len, device=q_unpad.device) + start_q + + if is_causal: + mask = pos_keys[None, :] > pos_queries[:, None] + mask = mask.float().masked_fill(mask, float("-inf")) + else: + mask = torch.zeros(q_len, k_len, device=q_unpad.device) + + if sliding_window is not None: + too_old = pos_keys[None, :] < (pos_queries[:, None] - sliding_window + 1) + mask.masked_fill_(too_old, float("-inf")) + + logits = logits + mask[None, None, :, :] + + logits_max = torch.max(logits, dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(sinks_expanded, logits_max) + sinks_exp = torch.exp(sinks_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks_exp + scores = unnormalized_scores / normalizer + + out = torch.einsum("hgqk,khgd->qhgd", scores, v_seq.float()) + out = out.reshape(q_len, num_heads, head_dim).to(q_unpad.dtype) + + output[q_start:q_end] = out + + return output + + +def main( + batch: int = 1, + heads: int = 64, + q_seqlen: int = 2048, + k_seqlen: int = 2048, + dim: int = 128, + groups: int = 16, + is_causal: bool = True, + window_size: Optional[int] = None, +): + assert heads % groups == 0, "heads must be divisible by groups" + + flops_per_matmul = 
2.0 * batch * heads * q_seqlen * k_seqlen * dim + total_flops = 5 * flops_per_matmul # fwd + bwd + + if is_causal: + total_flops *= 0.5 + + if window_size is not None: + print(f"Using sliding window attention with window_size={window_size}") + flops_per_matmul = 2.0 * batch * heads * min(window_size, k_seqlen // 2) * q_seqlen * dim + total_flops = 5 * flops_per_matmul + + dtype = torch.float16 + device = torch.device("cuda") + + head_kv = heads // groups + q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device) + k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + sinks = torch.randn(heads, dtype=dtype, device=device) + + query_padding_mask = generate_random_padding_mask(q_seqlen, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(k_seqlen, batch, device, mode="random") + + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + + q_unpad = q_unpad.requires_grad_(True) + k_unpad = k_unpad.requires_grad_(True) + v_unpad = v_unpad.requires_grad_(True) + sinks = sinks.requires_grad_(True) + + dO_unpad = torch.randn_like(q_unpad) + + # TileLang forward + backward + # N_CTX is the padded sequence length used for tensor allocation + N_CTX = q_seqlen + O_unpad = attention( + q_unpad, k_unpad, v_unpad, sinks, cu_seqlens_q, cu_seqlens_k, N_CTX, max_seqlen_q, max_seqlen_k, window_size, groups, is_causal + ) + O_unpad.backward(dO_unpad, retain_graph=True) + dQ, q_unpad.grad = q_unpad.grad.clone(), None + dK, k_unpad.grad = k_unpad.grad.clone(), None + dV, v_unpad.grad = v_unpad.grad.clone(), None + dsinks, sinks.grad = sinks.grad.clone(), None + + # Reference forward + backward + O_ref_unpad = ref_program( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + sinks, + batch, + is_causal, + sliding_window=window_size, + groups=groups, + ) + O_ref_unpad.backward(dO_unpad, retain_graph=True) + dQ_ref, q_unpad.grad = q_unpad.grad.clone(), None + dK_ref, k_unpad.grad = k_unpad.grad.clone(), None + dV_ref, v_unpad.grad = v_unpad.grad.clone(), None + dsinks_ref, sinks.grad = sinks.grad.clone(), None + + # Checks + # Sliding window attention has slightly higher numerical error due to more complex masking + rtol, atol = (2e-2, 2e-2) if window_size is not None else (1e-2, 1e-2) + assert torch.allclose(O_unpad, O_ref_unpad, rtol=rtol, atol=atol), f"O max err: {(O_unpad - O_ref_unpad).abs().max()}" + assert torch.allclose(dV, dV_ref, rtol=rtol, atol=atol), f"dV max err: {(dV - dV_ref).abs().max()}" + assert torch.allclose(dK, dK_ref, rtol=rtol, atol=atol), f"dK max err: {(dK - dK_ref).abs().max()}" + assert torch.allclose(dQ, dQ_ref, rtol=rtol, atol=atol), f"dQ max err: {(dQ - dQ_ref).abs().max()}" + assert torch.allclose(dsinks, dsinks_ref, rtol=rtol, atol=atol), f"dsinks max err: {(dsinks - dsinks_ref).abs().max()}" + + print("All checks passed for tilelang kernels.✅") + + # Benchmark backward + def torch_bwd(): + O_ref_unpad.backward(dO_unpad, retain_graph=True) + + def tl_bwd(): + O_unpad.backward(dO_unpad, retain_graph=True) + + latency = do_bench(torch_bwd, warmup=500) + print("torch: {:.2f} ms".format(latency)) + print("torch: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + latency = do_bench(tl_bwd, warmup=500) + print("tilelang: {:.2f} 
ms".format(latency)) + print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + parser.add_argument("--is_causal", action="store_true", help="causal attention") + parser.add_argument("--window_size", type=int, default=None, help="sliding window size (default: None for full attention)") + args = parser.parse_args() + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal, args.window_size) diff --git a/examples/attention_sink/example_gqa_sink_fwd_varlen.py b/examples/attention_sink/example_gqa_sink_fwd_varlen.py new file mode 100644 index 0000000000..16838dd860 --- /dev/null +++ b/examples/attention_sink/example_gqa_sink_fwd_varlen.py @@ -0,0 +1,401 @@ +# ruff: noqa +# Using varlen (variable length) format with attention sink + +import argparse +import torch +import tilelang +import tilelang.language as T +import tilelang.testing +from tilelang.profiler import do_bench +from typing import Optional +import sys +import os + +sys.path.append(os.path.join(os.path.dirname(__file__), "../flash_attention")) +from varlen_utils import generate_random_padding_mask, generate_qkv + + +@tilelang.jit( + out_idx=[7], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn_sink( + batch_size, + groups, + UQ, + UKV, + heads, + dim, + is_causal, + window_size=None, # None for full causal attention + sm_scale=None, + block_M=64, + block_N=64, + num_stages=1, + threads=128, +): + if window_size is not None: + assert window_size % block_N == 0, "window_size must be divisible by block_N" + + if sm_scale is None: + sm_scale = (1.0 / dim) ** 0.5 + scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // groups + q_shape = [UQ, heads, dim] + kv_shape = [UKV, head_kv, dim] + o_shape = [UQ, heads, dim] + dtype = T.float16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q_unpad: T.Tensor(q_shape, dtype), + K_unpad: T.Tensor(kv_shape, dtype), + V_unpad: T.Tensor(kv_shape, dtype), + cu_seqlens_q: T.Tensor([batch_size + 1], T.int32), + cu_seqlens_k: T.Tensor([batch_size + 1], T.int32), + max_seqlen_q: T.int32, + Sinks: T.Tensor([heads], dtype), + Output_unpad: T.Tensor(o_shape, dtype), + ): + with T.Kernel(T.ceildiv(max_seqlen_q, block_M), heads, batch_size, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + sinks = T.alloc_fragment([block_M], 
dtype) + + batch_idx = bz + head_idx = by + kv_head_idx = head_idx // groups + + q_start_idx = cu_seqlens_q[batch_idx] + kv_start_idx = cu_seqlens_k[batch_idx] + q_end_idx = cu_seqlens_q[batch_idx + 1] + k_end_idx = cu_seqlens_k[batch_idx + 1] + + q_current_seqlen = q_end_idx - q_start_idx + kv_current_seqlen = k_end_idx - kv_start_idx + + T.copy(Q_unpad[q_start_idx + bx * block_M : q_start_idx + (bx + 1) * block_M, head_idx, :], Q_shared) + + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + for i in T.Parallel(block_M): + sinks[i] = Sinks[head_idx] + + offset = kv_current_seqlen - q_current_seqlen # always align on the right + max_visible_k_idx = offset + (bx + 1) * block_M + + # Determine loop range based on causal mask and sliding window + if is_causal: + if window_size is not None: + # Sliding window + causal: start from window boundary + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + # Full causal attention + start = 0 + end = T.min(T.ceildiv(max_visible_k_idx, block_N), T.ceildiv(kv_current_seqlen, block_N)) + else: + if window_size is not None: + start = T.max(0, (offset + bx * block_M - window_size + 1) // block_N) + end = T.ceildiv(kv_current_seqlen, block_N) + else: + start = 0 + end = T.ceildiv(kv_current_seqlen, block_N) + + loop_range = end - start + + for k in T.Pipelined(loop_range, num_stages=num_stages): + actual_k = k + start + T.copy(K_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], K_shared) + + # Build mask considering causal, sliding window, and padding + if is_causal: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + # Causal + sliding window mask + acc_s[i, j] = T.if_then_else( + (q_idx < k_idx) # causal: can't see future + or (q_idx >= k_idx + window_size) # sliding window: too old + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i + offset < actual_k * block_N + j) + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + if window_size is not None: + for i, j in T.Parallel(block_M, block_N): + q_idx = bx * block_M + i + offset + k_idx = actual_k * block_N + j + acc_s[i, j] = T.if_then_else( + (q_idx >= k_idx + window_size) # sliding window: too old + or (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else( + (bx * block_M + i >= q_current_seqlen or actual_k * block_N + j >= kv_current_seqlen), + -T.infinity(acc_s.dtype), + 0, + ) + + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + + # Check_inf for sliding window attention + if window_size is not None: + for i in T.Parallel(block_M): + scores_max[i] = T.if_then_else(scores_max[i] == -T.infinity(accum_dtype), 0, scores_max[i]) + + for 
i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + + T.copy(V_unpad[kv_start_idx + actual_k * block_N : kv_start_idx + (actual_k + 1) * block_N, kv_head_idx, :], V_shared) + + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + + # Attention sink: add sink contribution to logsum + for i in T.Parallel(block_M): + logsum[i] += T.exp2(sinks[i] * 1.44269504 - scores_max[i] * scale) + + for i, j in T.Parallel(block_M, dim): + # When sq > skv, some tokens can see nothing (for causal) + acc_o[i, j] = 0 if is_causal and bx * block_M + i + offset < 0 else acc_o[i, j] / logsum[i] + + T.copy(acc_o, O_shared) + for i, d in T.Parallel(block_M, dim): + if bx * block_M + i < q_current_seqlen: + Output_unpad[q_start_idx + bx * block_M + i, head_idx, d] = O_shared[i, d] + + return main + + +def ref_program( + q_unpad: torch.Tensor, + k_unpad: torch.Tensor, + v_unpad: torch.Tensor, + cu_seqlens_q: torch.Tensor, + cu_seqlens_k: torch.Tensor, + max_seqlen_q: int, + max_seqlen_k: int, + sinks: torch.Tensor, + batch_size: int, + is_causal: bool, + sliding_window: Optional[int] = None, + groups: int = 1, +) -> torch.Tensor: + """Reference implementation for varlen attention with sinks.""" + # q_unpad: [total_q, heads, dim] + # k_unpad: [total_kv, head_kv, dim] + # v_unpad: [total_kv, head_kv, dim] + total_q, num_heads, head_dim = q_unpad.shape + _, num_key_value_heads, _ = k_unpad.shape + + sm_scale = 1.0 / head_dim**0.5 + + output = torch.zeros_like(q_unpad) + + for b in range(batch_size): + q_start = cu_seqlens_q[b].item() + q_end = cu_seqlens_q[b + 1].item() + k_start = cu_seqlens_k[b].item() + k_end = cu_seqlens_k[b + 1].item() + + q_len = q_end - q_start + k_len = k_end - k_start + + if q_len == 0: + continue + + # Extract sequences for this batch + q_seq = q_unpad[q_start:q_end] # [q_len, heads, dim] + k_seq = k_unpad[k_start:k_end] # [k_len, head_kv, dim] + v_seq = v_unpad[k_start:k_end] # [k_len, head_kv, dim] + + # Reshape for GQA + q_seq = q_seq.view(q_len, num_key_value_heads, groups, head_dim) # [q_len, head_kv, groups, dim] + sinks_expanded = sinks.view(num_key_value_heads, groups, 1, 1).float() # [head_kv, groups, 1, 1] + + k_seq = k_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + v_seq = v_seq.unsqueeze(2) # [k_len, head_kv, 1, dim] + + # Compute attention + # q_seq: [q_len, head_kv, groups, dim], k_seq: [k_len, head_kv, 1, dim] + logits = torch.einsum("qhgd,khgd->hgqk", q_seq.float(), k_seq.float()) * sm_scale + + # Build mask + start_q = k_len - q_len # offset for causal alignment + pos_keys = torch.arange(k_len, device=q_unpad.device) + pos_queries = torch.arange(q_len, device=q_unpad.device) + start_q + + if is_causal: + mask = pos_keys[None, :] > pos_queries[:, None] + mask = mask.float().masked_fill(mask, float("-inf")) + else: + mask = torch.zeros(q_len, k_len, device=q_unpad.device) + + if sliding_window is not None: + too_old = pos_keys[None, :] < (pos_queries[:, None] - sliding_window + 1) + mask.masked_fill_(too_old, float("-inf")) + + logits = logits + mask[None, None, :, :] # [head_kv, groups, q_len, k_len] + + # Apply sink-adjusted softmax + logits_max = torch.max(logits, 
dim=-1, keepdim=True).values + logits_or_sinks_max = torch.maximum(sinks_expanded, logits_max) + sinks_exp = torch.exp(sinks_expanded - logits_or_sinks_max) + unnormalized_scores = torch.exp(logits - logits_or_sinks_max) + normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks_exp + scores = unnormalized_scores / normalizer + + # Compute output + out = torch.einsum("hgqk,khgd->qhgd", scores, v_seq.float()) + out = out.reshape(q_len, num_heads, head_dim).to(q_unpad.dtype) + + output[q_start:q_end] = out + + return output + + +def main( + batch: int = 1, + heads: int = 64, + q_seqlen: int = 2048, + k_seqlen: int = 2048, + dim: int = 128, + groups: int = 16, + is_causal: bool = True, + window_size: Optional[int] = None, +): + assert heads % groups == 0, "heads must be divisible by groups" + + flops_per_matmul = 2.0 * batch * heads * q_seqlen * k_seqlen * dim + total_flops = 2 * flops_per_matmul + + tilelang.testing.set_random_seed(0) + + if is_causal: + total_flops *= 0.5 + + if window_size is not None: + print(f"Using sliding window attention with window_size={window_size}") + flops_per_matmul = 2.0 * batch * heads * min(window_size, k_seqlen // 2) * q_seqlen * dim + total_flops = 2 * flops_per_matmul + + dtype = torch.float16 + device = torch.device("cuda") + + head_kv = heads // groups + q = torch.randn(batch, q_seqlen, heads, dim, dtype=dtype, device=device) + k = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + v = torch.randn(batch, k_seqlen, head_kv, dim, dtype=dtype, device=device) + sinks = torch.randn(heads, dtype=dtype, device=device) + + query_padding_mask = generate_random_padding_mask(q_seqlen, batch, device, mode="random") + key_padding_mask = generate_random_padding_mask(k_seqlen, batch, device, mode="random") + + ( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + q, + k, + v, + output_pad_fn, + _, + _, + ) = generate_qkv(q, k, v, query_padding_mask, key_padding_mask, kvpacked=False) + + UQ = q_unpad.shape[0] + UKV = k_unpad.shape[0] + + kernel = flashattn_sink( + batch, groups, UQ, UKV, heads, dim, is_causal, window_size=window_size, block_M=128, block_N=128, num_stages=2, threads=256 + ) + + out_unpad = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, sinks) + out = output_pad_fn(out_unpad) + + # Reference implementation + ref_out_unpad = ref_program( + q_unpad, + k_unpad, + v_unpad, + cu_seqlens_q, + cu_seqlens_k, + max_seqlen_q, + max_seqlen_k, + sinks, + batch, + is_causal, + sliding_window=window_size, + groups=groups, + ) + ref_out = output_pad_fn(ref_out_unpad) + + torch.testing.assert_close(out, ref_out, rtol=1e-2, atol=1e-2) + + print("All checks passed.✅") + latency = do_bench( + lambda: kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k, max_seqlen_q, sinks), + warmup=500, + ) + print("Tile-lang: {:.2f} ms".format(latency)) + print("Tile-lang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=8, help="batch size") + parser.add_argument("--heads", type=int, default=64, help="query heads") + parser.add_argument("--groups", type=int, default=16, help="groups") + parser.add_argument("--q_seqlen", type=int, default=2048, help="query sequence length") + parser.add_argument("--k_seqlen", type=int, default=2048, help="key/value sequence length") + parser.add_argument("--dim", type=int, default=128, help="head dim") + 
parser.add_argument("--is_causal", action="store_true", help="causal attention") + parser.add_argument("--window_size", type=int, default=None, help="sliding window size (default: None for full attention)") + args = parser.parse_args() + main(args.batch, args.heads, args.q_seqlen, args.k_seqlen, args.dim, args.groups, args.is_causal, args.window_size) diff --git a/examples/attention_sink/example_mha_sink_bwd_bhsd.py b/examples/attention_sink/example_mha_sink_bwd_bhsd.py index 66905f55d1..fa045a1d78 100644 --- a/examples/attention_sink/example_mha_sink_bwd_bhsd.py +++ b/examples/attention_sink/example_mha_sink_bwd_bhsd.py @@ -13,7 +13,7 @@ def get_bwd_configs(): sm_version = sm_major * 10 + sm_minor if sm_version == 80: return 64, 32, 1, 128 - elif sm_version == 90: + elif sm_version >= 90: return 128, 32, 2, 256 else: raise ValueError(f"Unsupported SM version: {sm_version}") diff --git a/examples/attention_sink/regression_attention_sink.py b/examples/attention_sink/regression_attention_sink.py index e2453173cf..f1116befcb 100644 --- a/examples/attention_sink/regression_attention_sink.py +++ b/examples/attention_sink/regression_attention_sink.py @@ -1,9 +1,7 @@ import tilelang.testing import example_mha_sink_fwd_bhsd -import example_mha_sink_fwd_bhsd_wgmma_pipelined import example_mha_sink_bwd_bhsd import example_gqa_sink_bwd_bhsd -import example_gqa_sink_fwd_bhsd_wgmma_pipelined def regression_example_mha_sink_fwd_bhsd(): @@ -16,30 +14,6 @@ def regression_example_mha_sink_fwd_bhsd_sliding_window(): ) -def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined(): - tilelang.testing.process_func(example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) - - -def regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): - tilelang.testing.process_func( - example_mha_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, - "regression_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window", - window_size=128, - ) - - -def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined(): - tilelang.testing.process_func(example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf) - - -def regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window(): - tilelang.testing.process_func( - example_gqa_sink_fwd_bhsd_wgmma_pipelined.run_regression_perf, - "regression_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window", - window_size=128, - ) - - def regression_example_mha_sink_bwd_bhsd(): tilelang.testing.process_func(example_mha_sink_bwd_bhsd.run_regression_perf) diff --git a/examples/attention_sink/test_example_attention_sink.py b/examples/attention_sink/test_example_attention_sink.py index 57242c199c..41682c6c7c 100644 --- a/examples/attention_sink/test_example_attention_sink.py +++ b/examples/attention_sink/test_example_attention_sink.py @@ -1,10 +1,10 @@ import tilelang.testing import example_mha_sink_fwd_bhsd -import example_mha_sink_fwd_bhsd_wgmma_pipelined -import example_gqa_sink_fwd_bhsd_wgmma_pipelined import example_mha_sink_bwd_bhsd import example_gqa_sink_bwd_bhsd +import example_gqa_sink_fwd_varlen +import example_gqa_sink_bwd_varlen @tilelang.testing.requires_cuda @@ -17,30 +17,6 @@ def test_example_mha_sink_fwd_bhsd_sliding_window(): example_mha_sink_fwd_bhsd.main(window_size=128) -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_mha_sink_fwd_bhsd_wgmma_pipelined_full_attn(): - example_mha_sink_fwd_bhsd_wgmma_pipelined.main() - - -@tilelang.testing.requires_cuda 
-@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
-def test_example_mha_sink_fwd_bhsd_wgmma_pipelined_sliding_window():
-    example_mha_sink_fwd_bhsd_wgmma_pipelined.main(window_size=128)
-
-
-@tilelang.testing.requires_cuda
-@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
-def test_example_gqa_sink_fwd_bhsd_wgmma_pipelined_full_attn():
-    example_gqa_sink_fwd_bhsd_wgmma_pipelined.main()
-
-
-@tilelang.testing.requires_cuda
-@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
-def test_example_gqa_sink_fwd_bhsd_wgmma_pipelined_sliding_window():
-    example_gqa_sink_fwd_bhsd_wgmma_pipelined.main(window_size=128)
-
-
 @tilelang.testing.requires_cuda
 def test_example_mha_sink_bwd_bhsd():
     example_mha_sink_bwd_bhsd.main()
@@ -61,5 +37,12 @@ def test_example_gqa_sink_bwd_bhsd_sliding_window():
     example_gqa_sink_bwd_bhsd.main(window_size=128)
 
 
+@tilelang.testing.requires_cuda
+@tilelang.testing.requires_cuda_compute_version_ge(9, 0)
+def test_example_gqa_sink_varlen():
+    example_gqa_sink_fwd_varlen.main()  # non-causal
+    example_gqa_sink_bwd_varlen.main()  # causal
+
+
 if __name__ == "__main__":
     tilelang.testing.main()
diff --git a/examples/autodd/README.md b/examples/autodd/README.md
new file mode 100644
index 0000000000..9ae9f98167
--- /dev/null
+++ b/examples/autodd/README.md
@@ -0,0 +1,126 @@
+# AutoDD - Automatic Delta Debugging for TileLang
+
+AutoDD (Automatic Delta Debugging) is a built-in debugging tool for TileLang that automatically simplifies a complex Python program down to the minimal code needed to reproduce a specific error. This is extremely useful when debugging large, complex TileLang programs.
+
+## What is Delta Debugging?
+
+Delta debugging is an automated debugging technique whose core idea is:
+1. Start from a program that triggers a bug
+2. Systematically remove code fragments from the program
+3. Check whether the simplified program still triggers the same bug
+4. Repeat until only the minimal bug-triggering code remains
+
+AutoDD uses a Probability Distribution Driven Delta Debugging (PDD) algorithm to search efficiently for a minimized program.
+
+## Why AutoDD?
+
+When developing TileLang programs, bugs are often hidden in complex code:
+
+- **Lots of irrelevant code**: Real projects may have hundreds of lines of configuration, helper functions, logging, etc.
+- **Hard to locate**: Error messages may point at the underlying TVM/CUDA layers rather than at TileLang code
+- **Tedious debugging**: Manually deleting code to locate a bug is very time-consuming
+
+AutoDD automates this process, reducing hundreds of lines of code to a few dozen and directly exposing the root cause of the problem.
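+
+To make the loop above concrete, here is a minimal, self-contained sketch of greedy line-level delta debugging. This is only an illustration of the general technique; AutoDD itself works on the AST and drives the search with its probability model rather than this naive loop:
+
+```python
+# Illustrative greedy delta debugging over source lines (not AutoDD's code).
+# A deletion is kept whenever the target error message still appears.
+import subprocess
+import sys
+import tempfile
+
+
+def still_fails(lines, err_msg, timeout=60):
+    """Run the candidate program and check that the error message survives."""
+    with tempfile.NamedTemporaryFile("w", suffix=".py", delete=False) as f:
+        f.write("\n".join(lines))
+        path = f.name
+    try:
+        proc = subprocess.run([sys.executable, path], capture_output=True, text=True, timeout=timeout)
+    except subprocess.TimeoutExpired:
+        return False
+    return err_msg in proc.stdout or err_msg in proc.stderr
+
+
+def minimize(lines, err_msg):
+    """Try ever-smaller deletions until no chunk can be removed."""
+    chunk = max(1, len(lines) // 2)
+    while chunk >= 1:
+        i = 0
+        while i < len(lines):
+            candidate = lines[:i] + lines[i + chunk:]
+            if candidate and still_fails(candidate, err_msg):
+                lines = candidate  # the deletion kept the bug: accept it
+            else:
+                i += chunk  # the deletion lost the bug: keep these lines
+        chunk //= 2
+    return lines
+```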
+
+## Usage
+
+### Basic Usage
+
+```bash
+python -m tilelang.autodd <input.py> --err-msg "<error message>" -o <minimized.py>
+```
+
+### Parameters
+
+| Parameter | Description |
+|-----------|-------------|
+| `source` | Path to the input Python source file |
+| `--err-msg` | Error message to match (searched in stdout or stderr) |
+| `-o, --output` | Path to the minimized output file |
+| `--backend` | Execution backend: `runner` (faster) or `subproc` (more stable), default `runner` |
+| `--timeout` | Timeout for each task in seconds, default 60 |
+| `-j, --jobs` | Number of parallel jobs, default 1 |
+
+### Example
+
+Run AutoDD on `tilelang_buggy.py` in this directory:
+
+```bash
+# Use 4 parallel jobs, search for "Dimension mismatch" error
+python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py -j 4
+
+# Or use the subprocess backend (more stable but slower)
+python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py --backend subproc
+```
+
+## Example Files
+
+### `tilelang_buggy.py`
+
+A complex TileLang program with a bug (~200 lines), containing:
+- Multiple useless helper functions (`calculate_optimal_block_size`, `get_memory_requirements`, etc.)
+- A complex configuration class (`MatmulConfig`)
+- Unused benchmark code (`benchmark_pytorch`)
+- **A GEMM shape mismatch bug**
+
+The bug is on line 124:
+```python
+B_shared = T.alloc_shared((block_M, block_N), dtype)  # Wrong! Should be (block_K, block_N)
+```
+
+### `tilelang_minimized_expected.py`
+
+The expected output after AutoDD simplification (~30 lines). The simplified code clearly shows the root cause of the bug:
+
+```python
+def buggy_matmul(...):
+    @T.prim_func
+    def matmul_kernel():
+        with T.Kernel():
+            A_shared = T.alloc_shared((block_M, block_K), dtype)
+            B_shared = T.alloc_shared((block_M, block_N), dtype)  # Bug!
+            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
+            T.gemm(A_shared, B_shared, C_local)  # Error occurs here
+```
+
+## How AutoDD Works
+
+AutoDD uses AST (Abstract Syntax Tree) analysis and multiple rewrite rules to simplify code; a small illustrative sketch follows the four rule groups below:
+
+### 1. Fast Reducers
+- **Statement removal**: Directly remove statements that don't affect bug reproduction
+- **If statement simplification**: Simplify `if cond: body` to `body`
+- **For loop simplification**: Bind loop variables to constants
+
+### 2. Canonicalizers
+- **With statement expansion**: Convert `with expr as var` to explicit assignment
+- **Function argument extension**: Add `*args, **kwargs` for compatibility
+
+### 3. Simplifiers
+- **Assignment simplification**: Replace complex expressions with constants
+- **Function call simplification**: Simplify `f(x)` to `x`
+- **Binary operation simplification**: Simplify `a + b` to `a` or `b`
+
+### 4. Slow Reducers
+- **Expression removal**: Remove arbitrary expressions
+- **Argument removal**: Remove function arguments
+- **Integer reduction**: Gradually reduce large integers
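+
+As a concrete illustration of the rule groups above, the sketch below performs a single AST-level "statement removal" step. It is illustrative only (AutoDD's actual reducers are more elaborate), but the overall shape is the same: parse, mutate the tree, unparse, then re-run the candidate and check for the error:
+
+```python
+# Hypothetical single "statement removal" step (not AutoDD's implementation).
+import ast
+
+
+def drop_top_level_stmt(source: str, i: int) -> str:
+    """Return `source` with its i-th top-level statement removed."""
+    tree = ast.parse(source)
+    if 0 <= i < len(tree.body):
+        del tree.body[i]
+    return ast.unparse(tree)  # requires Python 3.9+
+
+
+src = "x = 1\nprint('kept')\nraise RuntimeError('bug')\n"
+# Dropping statement 0 removes "x = 1" but keeps the failing line, so a
+# checker that re-runs this candidate would accept the deletion.
+print(drop_top_level_stmt(src, 0))
+```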
+
+## Use Cases
+
+1. **TileLang kernel debugging**: Simplify complex TileLang programs to locate bugs
+2. **Bug report submission**: Generate minimal reproduction code for easier issue tracking
+3. **Understanding errors**: Easier to understand the nature of errors after removing irrelevant code
+4. **Regression testing**: Simplified code can serve as regression test cases
+
+## Notes
+
+1. **Error message matching**: The `--err-msg` string must exactly match a substring of the program's stdout or stderr
+2. **Timeout setting**: For programs with long compilation times, you may need to increase `--timeout`
+3. **Parallel jobs**: Increasing `-j` can speed up the simplification process but consumes more resources
+4. **Backend selection**: If the `runner` backend is unstable, try the `subproc` backend
+
+## References
+
+- [Delta Debugging Paper](https://www.st.cs.uni-saarland.de/papers/tse2002/)
+- [TileLang Documentation](https://github.com/tile-ai/tilelang)
diff --git a/examples/autodd/tilelang_buggy.py b/examples/autodd/tilelang_buggy.py
new file mode 100644
index 0000000000..d2c5469bbe
--- /dev/null
+++ b/examples/autodd/tilelang_buggy.py
@@ -0,0 +1,229 @@
+"""
+A complex TileLang program with lots of redundant code and a bug that triggers an error.
+AutoDD will simplify it to the minimal code needed to reproduce the error.
+
+This example demonstrates how AutoDD can help developers quickly isolate bugs
+in complex TileLang programs by automatically removing irrelevant code.
+
+To run AutoDD on this file:
+    python -m tilelang.autodd tilelang_buggy.py --err-msg "Dimension mismatch" -o minimized.py -j 4
+
+The bug in this file: B_shared has shape (block_M, block_N) instead of (block_K, block_N),
+causing a GEMM dimension mismatch error.
+"""
+
+import tilelang
+import tilelang.language as T
+import torch
+
+
+# Useless helper function - will be removed by AutoDD
+def calculate_optimal_block_size(M, N, K):
+    """Calculate optimal block size - this function is completely useless"""
+    options = [32, 64, 128, 256]
+    best = 128
+    for opt in options:
+        if M % opt == 0 and N % opt == 0:
+            best = opt
+            break
+    return best, best, 32
+
+
+def get_memory_requirements(M, N, K, block_M, block_N, block_K, dtype_size=2):
+    """Calculate memory requirements - completely useless"""
+    shared_mem_a = block_M * block_K * dtype_size
+    shared_mem_b = block_K * block_N * dtype_size
+    total_shared = shared_mem_a + shared_mem_b
+    return total_shared
+
+
+def validate_parameters(M, N, K, block_M, block_N, block_K):
+    """Validate parameters - redundant check"""
+    if M <= 0 or N <= 0 or K <= 0:
+        raise ValueError("Matrix dimensions must be positive")
+    if block_M <= 0 or block_N <= 0 or block_K <= 0:
+        raise ValueError("Block sizes must be positive")
+    if M % block_M != 0:
+        print(f"Warning: M ({M}) not divisible by block_M ({block_M})")
+    if N % block_N != 0:
+        print(f"Warning: N ({N}) not divisible by block_N ({block_N})")
+    if K % block_K != 0:
+        print(f"Warning: K ({K}) not divisible by block_K ({block_K})")
+    return True
+
+
+class MatmulConfig:
+    """Configuration class - increases code complexity but is actually useless"""
+
+    def __init__(self, M, N, K):
+        self.M = M
+        self.N = N
+        self.K = K
+        self.block_M = 128
+        self.block_N = 128
+        self.block_K = 32
+        self.num_stages = 3
+        self.threads = 128
+        self.dtype = "float16"
+        self.accum_dtype = "float32"
+
+    def get_grid_size(self):
+        grid_x = (self.N + self.block_N - 1) // self.block_N
+        grid_y = (self.M + self.block_M - 1) // self.block_M
+        return grid_x, grid_y
+
+    def get_shared_memory_size(self):
+        return get_memory_requirements(self.M, self.N, self.K, self.block_M, self.block_N, self.block_K)
+
+    def validate(self):
+        return validate_parameters(self.M, self.N, self.K, self.block_M, self.block_N, self.block_K)
+
+
+def create_reference_output(a, b, activation="relu"):
+    """Create reference output - not actually used in verification"""
+    result = a @ b
+    if activation == "relu":
+        result = torch.relu(result)
+    elif activation == "gelu":
+        result = torch.nn.functional.gelu(result)
+    elif activation == "sigmoid":
+        result = torch.sigmoid(result)
+    return result
+
+
+def benchmark_pytorch(M, N, K, num_iters=10, warmup=5): + """PyTorch benchmark - not used""" + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + + # Warmup + for _ in range(warmup): + _ = a @ b + torch.cuda.synchronize() + + # Benchmark + import time + + start = time.time() + for _ in range(num_iters): + _ = a @ b + torch.cuda.synchronize() + end = time.time() + + return (end - start) / num_iters * 1000 # ms + + +# Main TileLang kernel - contains a BUG: GEMM shape mismatch! +@tilelang.jit +def buggy_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32): + @T.prim_func + def matmul_kernel( + A: T.Tensor((M, K), dtype), + B: T.Tensor((K, N), dtype), + C: T.Tensor((M, N), dtype), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=128) as (bx, by): + # Allocate shared memory + A_shared = T.alloc_shared((block_M, block_K), dtype) + # BUG: the first dimension of B_shared should be block_K, but block_M is used here! + B_shared = T.alloc_shared((block_M, block_N), dtype) # Wrong shape! + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Allocate some useless temp variables + temp_buffer = T.alloc_fragment((block_M, block_N), accum_dtype) + + # Zero out + T.clear(C_local) + T.clear(temp_buffer) + + # Main loop + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + # Copy a tile of A + T.copy(A[by * block_M, ko * block_K], A_shared) + + # Copy a tile of B - shape can mismatch here too + T.copy(B[ko * block_K, bx * block_N], B_shared) + + # GEMM computation - shape mismatch will cause an error + # A_shared: (block_M, block_K) + # B_shared: (block_M, block_N) <- should be (block_K, block_N) + T.gemm(A_shared, B_shared, C_local) + + # ReLU activation + for i, j in T.Parallel(block_M, block_N): + C_local[i, j] = T.max(C_local[i, j], 0) + + # Some useless postprocessing + for i, j in T.Parallel(block_M, block_N): + if temp_buffer[i, j] > 0: + C_local[i, j] = C_local[i, j] + 0.0 + + # Write back result + T.copy(C_local, C[by * block_M, bx * block_N]) + + return matmul_kernel + + +def run_kernel(config): + """Run kernel - includes extra redundant logic""" + # Validate parameters + config.validate() + + # Get config + M, N, K = config.M, config.N, config.K + block_M, block_N, block_K = config.block_M, config.block_N, config.block_K + + # Calculate some useless statistics + grid_size = config.get_grid_size() + shared_mem = config.get_shared_memory_size() + print(f"Grid size: {grid_size}") + print(f"Shared memory: {shared_mem} bytes") + + # Create test data + a = torch.randn(M, K, device="cuda", dtype=torch.float16) + b = torch.randn(K, N, device="cuda", dtype=torch.float16) + c = torch.empty(M, N, device="cuda", dtype=torch.float16) + + # Compile and run kernel - will trigger the BUG here + kernel = buggy_matmul(M, N, K, block_M, block_N, block_K) + kernel(a, b, c) + + # Validate results (if it can get here) + ref_c = torch.relu(a @ b) + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("Kernel output matches PyTorch reference.") + + return c + + +def main(): + # Useless printing + print("=" * 60) + print("TileLang Matmul Kernel Test") + print("=" * 60) + + # Create config + M, N, K = 512, 512, 512 + config = MatmulConfig(M, N, K) + + # Calculate some useless values + optimal_block = calculate_optimal_block_size(M, N, K) + print(f"Optimal block size: {optimal_block}") + + # Run PyTorch benchmark - result is not used + # pytorch_time = 
benchmark_pytorch(M, N, K) + # print(f"PyTorch time: {pytorch_time:.3f} ms") + + # Run our kernel - will trigger the error here + try: + result = run_kernel(config) + print(f"Result shape: {result.shape}") + except Exception as e: + print(f"Error: {e}") + raise + + print("Done!") + + +if __name__ == "__main__": + main() diff --git a/examples/autodd/tilelang_minimized_expected.py b/examples/autodd/tilelang_minimized_expected.py new file mode 100644 index 0000000000..3dc88f9921 --- /dev/null +++ b/examples/autodd/tilelang_minimized_expected.py @@ -0,0 +1,49 @@ +""" +This is the expected output after running AutoDD on tilelang_buggy.py. +AutoDD automatically simplified the 200+ line buggy program to ~30 lines +while preserving the ability to reproduce the error. + +The minimized code clearly shows the root cause of the bug: +- A_shared has shape (block_M, block_K) +- B_shared has shape (block_M, block_N) - should be (block_K, block_N) +- This causes a dimension mismatch in T.gemm() +""" + +import tilelang.language as T + + +class MatmulConfig: + def __init__(self, *args, **kwargs): + self.M = 1 + self.N = 1 + self.K = 1 + self.block_M = 2 + self.block_N = 1 + self.block_K = 1 + + +def buggy_matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float32, *args, **kwargs): + @T.prim_func + def matmul_kernel(): + with T.Kernel(): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_M, block_N), dtype) # Bug: should be (block_K, block_N) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + T.gemm(A_shared, B_shared, C_local) + + +def run_kernel(config, *args, **kwargs): + M, N, K = (config.M, config.N, config.K) + block_M, block_N, block_K = (config.block_M, config.block_N, config.block_K) + buggy_matmul(M, N, K, block_M, block_N, block_K) + + +def main(*args, **kwargs): + config = MatmulConfig() + try: + run_kernel(config) + except Exception as e: + print(f"{e}") + + +main() diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py index 7b8b7b95cd..ab8346619d 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_decode.py @@ -139,8 +139,8 @@ def kernel( T.call_extern( "handle", "decode_i2u_to_i8s", - T.address_of(B_quant_local[0]), - T.address_of(B_dequantize_local[0]), + T.access_ptr(B_quant_local, "r"), + T.access_ptr(B_dequantize_local, "w"), ) if use_dp4a: @@ -155,7 +155,7 @@ def kernel( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py index f4a60098a5..9d7ebcf88c 100644 --- a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py +++ b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py @@ -253,8 +253,8 @@ def main( T.call_extern( "handle", "decode_i2u_to_i8s", - T.address_of(B_local[0]), - T.address_of(B_dequantize_local[0]), + T.access_ptr(B_local, "r"), + T.access_ptr(B_dequantize_local, "w"), ) for v in T.vectorized(0, local_size): diff --git 
a/examples/bitnet-1.58b/requirements.txt b/examples/bitnet-1.58b/requirements.txt
index 67357781e0..7660c28c6d 100644
--- a/examples/bitnet-1.58b/requirements.txt
+++ b/examples/bitnet-1.58b/requirements.txt
@@ -1,3 +1,3 @@
 lm_eval==0.3.0
 flash_attn
-transformers==4.53.0
+transformers==5.0.0rc3
diff --git a/examples/bitnet-1.58b/tokenization_bitnet.py b/examples/bitnet-1.58b/tokenization_bitnet.py
index 2adfd6dee1..8db57a9c09 100644
--- a/examples/bitnet-1.58b/tokenization_bitnet.py
+++ b/examples/bitnet-1.58b/tokenization_bitnet.py
@@ -38,10 +38,10 @@
 PRETRAINED_VOCAB_FILES_MAP = {
     "vocab_file": {
-        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model",
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer.model"
     },
     "tokenizer_file": {
-        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json",
+        "hf-internal-testing/llama-tokenizer": "https://huggingface.co/hf-internal-testing/llama-tokenizer/resolve/main/tokenizer_config.json"
     },
 }
 PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
diff --git a/examples/blockscaled_gemm_sm100/figures/blockscaled_data_path.svg b/examples/blockscaled_gemm_sm100/figures/blockscaled_data_path.svg
new file mode 100644
index 0000000000..f08de2b779
--- /dev/null
+++ b/examples/blockscaled_gemm_sm100/figures/blockscaled_data_path.svg
@@ -0,0 +1,138 @@
[SVG markup elided. Title: "SM100 block-scaled GEMM data path". Description: global memory A, B, SFA, and SFB move through TMA, shared memory, scale-factor transpose, UTCCP copy, tensor memory MMA, and epilogue store. Panels: GLOBAL MEMORY, SHARED MEMORY PIPELINE STAGE, TENSOR MEMORY AND MMA; barrier ring: consumed -> loaded -> with_sf_full -> consumed; final handoff via tmem_full (persistent path also uses tmem_empty).]
diff --git a/examples/blockscaled_gemm_sm100/figures/blockscaled_sf_layout.svg b/examples/blockscaled_gemm_sm100/figures/blockscaled_sf_layout.svg
new file mode 100644
index 0000000000..94d8ef0ca5
--- /dev/null
+++ b/examples/blockscaled_gemm_sm100/figures/blockscaled_sf_layout.svg
@@ -0,0 +1,163 @@
[SVG markup elided. Title: "Scale-factor packing, SMEM transpose, and TMEM layout". Description: UE8M0 scale factors are packed four per uint32, loaded to shared memory, transposed in 128-word chunks, copied to tensor memory columns, and selected by sf_id. Panels: 1. Global packed SF (word = sf0 | sf1 << 8 | sf2 << 16 | sf3 << 24, k % 4 selects the byte, scale = 2^(ue8m0 - 127)); 2. SMEM transpose for UTCCP (4 x 32 -> 32 x 4 per 128-word chunk); 3. TMEM SF columns (tcgen05.cp.32x128b.warpx4, sf_id selects the byte sub-column for each K iteration).]
diff --git a/examples/blockscaled_gemm_sm100/figures/blockscaled_variants.svg b/examples/blockscaled_gemm_sm100/figures/blockscaled_variants.svg
new file mode 100644
index 0000000000..f570c22666
--- /dev/null
+++ b/examples/blockscaled_gemm_sm100/figures/blockscaled_variants.svg
@@ -0,0 +1,97 @@
[SVG markup elided. Title: "Block-scaled GEMM kernel variants". Description: comparison of the 1CTA (mxfp8_blockscaled_gemm), 2CTA (mxfp8_blockscaled_gemm_2cta), and persistent 2CTA (mxfp8_blockscaled_gemm_2cta_persistent) scheduling designs; all run the same block-scaled MMA data path and differ only in CTA grouping and scheduling.]
diff --git a/examples/blockscaled_gemm_sm100/figures/blockscaled_warp_specialization.svg b/examples/blockscaled_gemm_sm100/figures/blockscaled_warp_specialization.svg
new file mode 100644
index 0000000000..2ae2e1b43e
--- /dev/null
+++ b/examples/blockscaled_gemm_sm100/figures/blockscaled_warp_specialization.svg
@@ -0,0 +1,103 @@
[SVG markup elided. Title: "Warp-specialized pipeline and barriers". Description: timeline for the TMA producer (warp 0), SF transposer (warp 2), leader-CTA MMA (warp 1), and epilogue roles; 2CTA uses cluster barriers for with_sf_full, consumed, and tmem_empty.]
diff --git a/examples/blockscaled_gemm_sm100/figures/sfa.png b/examples/blockscaled_gemm_sm100/figures/sfa.png
new file mode 100644
index 0000000000..545a648693
Binary files /dev/null and b/examples/blockscaled_gemm_sm100/figures/sfa.png differ
diff --git a/examples/blockscaled_gemm_sm100/gemm_mxfp8_blockscaled_1d1d.py b/examples/blockscaled_gemm_sm100/gemm_mxfp8_blockscaled_1d1d.py
new file mode 100644
index 0000000000..a8acb6dc97
--- /dev/null
+++ b/examples/blockscaled_gemm_sm100/gemm_mxfp8_blockscaled_1d1d.py
@@ -0,0 +1,762 @@
+# MXFP8 Block-Scaled GEMM on SM100
+# Blockscale size: (M, N, K) = (1, 1, 128)
+
+import argparse
+import torch
+import tilelang
+import tilelang.language as T
+from tilelang.carver.arch import driver
+from tilelang.profiler import do_bench
+
+
+@tilelang.jit
+def mxfp8_blockscaled_gemm(
+    A,
+    B,
+    SFA,
+    SFB,
+    block_M,
+    block_N,
+    block_K,
+    in_dtype,
+    out_dtype,
+    accum_dtype,
+    num_stages,
+    sf_granularity_k=128,
+    transpose_B=False,
+):
+    """1D-1D Block-scaled MXFP8 GEMM.
+
+    A: [M, K] in FP8 (E4M3 or E5M2)
+    B: [K, N] in FP8 (E4M3 or E5M2), or [N, K] when transpose_B=True
+    SFA: [ceil((K / sf_granularity_k) / 4) * M] in uint32
+        Group-major packed E8M0 scale factors for A.
+    SFB: [ceil((K / sf_granularity_k) / 4) * N] in uint32
+        Group-major packed E8M0 scale factors for B.
+    """
+    M, N, K = T.const("M, N, K")
+
+    k_iters = T.ceildiv(K, block_K)
+    # Load 4 K-blocks of SF at once → load every 4 iterations
+    sf_load_period = sf_granularity_k * 4 // block_K
+    sf_k_groups = T.ceildiv(T.ceildiv(K, sf_granularity_k), 4)
+
+    A: T.Tensor[[M, K], in_dtype]
+    B: T.Tensor[[N, K] if transpose_B else [K, N], in_dtype]
+    SFA: T.Tensor[[sf_k_groups * M], T.uint32]
+    SFB: T.Tensor[[sf_k_groups * N], T.uint32]
+    C = T.empty((M, N), out_dtype)
+
+    with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):
+        # Data shared memory (pipelined)
+        A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype)
+        B_shared = T.alloc_shared(
+            (num_stages, block_N, block_K) if transpose_B else (num_stages, block_K, block_N),
+            in_dtype,
+        )
+
+        # Scale factor shared memory — one uint32 per row/column, packing 4 K-blocks.
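+        # With block_K = sf_granularity_k = 128, each packed word
+        # sf0 | sf1 << 8 | sf2 << 16 | sf3 << 24 covers four K iterations,
+        # so the SF TMA only runs once every sf_load_period stages.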
+ SFA_shared = T.alloc_shared((num_stages, block_M), "uint32") + SFB_shared = T.alloc_shared((num_stages, block_N), "uint32") + + # Accumulator in tensor memory + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + + # Scale factors in tensor memory (TMEM has 128 rows / 32-bit cells) + SFA_tmem = T.alloc_tmem([block_M, block_M // 128 * 4], "uint32") + SFB_tmem = T.alloc_tmem([block_M, block_N // 128 * 4], "uint32") + + # Output buffers + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + # Barriers + loaded = T.alloc_barrier([32] * num_stages) + with_sf_full = T.alloc_barrier([32] * num_stages) + consumed = T.alloc_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1]) + + tx = T.get_thread_binding() + T.use_swizzle(8) + + if tx < 32: + # Warp 0: TMA load + for k in T.serial(k_iters): + T.mbarrier_wait_parity(consumed[k % num_stages], ((k // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + if transpose_B: + T.tma_copy( + B[by * block_N : (by + 1) * block_N, k * block_K : (k + 1) * block_K], + B_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + else: + T.tma_copy( + B[k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], + B_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + # Load one packed uint32 SF word every sf_load_period iterations. + if k % sf_load_period == 0: + sf_group_idx = k // sf_load_period + T.tma_copy( + SFA[sf_group_idx * M + bx * block_M : sf_group_idx * M + (bx + 1) * block_M], + SFA_shared[k % num_stages, :], + barrier=loaded[k % num_stages], + ) + T.tma_copy( + SFB[sf_group_idx * N + by * block_N : sf_group_idx * N + (by + 1) * block_N], + SFB_shared[k % num_stages, :], + barrier=loaded[k % num_stages], + ) + T.mbarrier_arrive(loaded[k % num_stages]) + + elif tx < 64: + # Warp 1: MMA issue + UTCCP + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], phase) + T.mbarrier_wait_parity(with_sf_full[stage], phase) + + if k % sf_load_period == 0: + T.tcgen05_cp_warpx4(SFA_shared[stage, :], SFA_tmem) + T.tcgen05_cp_warpx4(SFB_shared[stage, :], SFB_tmem) + + # sf_id selects which of the 4 packed E8M0 values to use + T.tcgen05_gemm_blockscaled( + A_shared[stage, :, :], + B_shared[stage, :, :], + C_tmem, + SFA_tmem, + SFB_tmem, + transpose_B=transpose_B, + mbar=consumed[stage], + clear_accum=k == 0, + sf_a_id=k % sf_load_period, + sf_b_id=k % sf_load_period, + ) + + T.tcgen05_mma_arrive(tmem_full) + + elif tx < 96: + # Warp 2: scale-factor transpose + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], phase) + + if k % sf_load_period == 0: + T.tcgen05_sf_warp_transpose(SFA_shared[stage, :]) + T.tcgen05_sf_warp_transpose(SFB_shared[stage, :]) + T.fence_proxy_async() + T.mbarrier_arrive(with_sf_full[stage]) + + # Epilogue: all warps + T.mbarrier_wait_parity(tmem_full, 0) + T.sync_threads() + + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N]) + + return C + + +@tilelang.jit +def mxfp8_blockscaled_gemm_2cta( + A, + B, + SFA, + SFB, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + sf_granularity_k=128, + transpose_B=False, +): + M, N, K = T.const("M, N, K") + + assert block_M == 128 
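+    # The 2CTA variant fixes its tile shape: a CTA pair computes one logical
+    # 128 x 256 tile, with each CTA loading a half_N = 128 panel of B.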
+ assert block_N == 256 + assert block_K == 128 + assert sf_granularity_k == 128 + + half_N = block_N // 2 + k_iters = T.ceildiv(K, block_K) + sf_load_period = sf_granularity_k * 4 // block_K + sf_k_groups = T.ceildiv(T.ceildiv(K, sf_granularity_k), 4) + assert sf_load_period == 4 + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[N, K] if transpose_B else [K, N], in_dtype] + SFA: T.Tensor[[sf_k_groups * M], T.uint32] + SFB: T.Tensor[[sf_k_groups * N], T.uint32] + C = T.empty((M, N), out_dtype) + + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128, cluster_dims=2) as (bx, by): + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) + + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared( + (num_stages, half_N, block_K) if transpose_B else (num_stages, block_K, half_N), + in_dtype, + ) + SFA_shared = T.alloc_shared((num_stages, block_M), "uint32") + SFB_shared = T.alloc_shared((num_stages, block_N), "uint32") + + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + SFA_tmem = T.alloc_tmem([block_M, 4], "uint32") + SFB_tmem = T.alloc_tmem([block_M, 8], "uint32") + + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + loaded = T.alloc_barrier([32] * num_stages) + with_sf_full = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1]) + + tx = T.get_thread_binding() + warp_idx = tx // 32 + T.use_swizzle(16) + + if warp_idx == 0: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(consumed[stage], phase ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[stage, :, :], + barrier=loaded[stage], + ) + if transpose_B: + T.tma_copy( + B[ + (by * block_N + cta_id * half_N) : (by * block_N + (cta_id + 1) * half_N), + k * block_K : (k + 1) * block_K, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + else: + T.tma_copy( + B[ + k * block_K : (k + 1) * block_K, + (by * block_N + cta_id * half_N) : (by * block_N + (cta_id + 1) * half_N), + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + if k % sf_load_period == 0: + sf_group_idx = k // sf_load_period + T.tma_copy( + SFA[sf_group_idx * M + bx * block_M : sf_group_idx * M + (bx + 1) * block_M], + SFA_shared[stage, :], + barrier=loaded[stage], + ) + T.tma_copy( + SFB[sf_group_idx * N + by * block_N : sf_group_idx * N + (by + 1) * block_N], + SFB_shared[stage, :], + barrier=loaded[stage], + ) + T.mbarrier_arrive(loaded[stage]) + + elif warp_idx == 1 and cta_id == 0: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(with_sf_full[stage], phase) + if k % sf_load_period == 0: + T.tcgen05_cp_warpx4(SFA_shared[stage, :], SFA_tmem, use_2cta=True) + T.tcgen05_cp_warpx4(SFB_shared[stage, :], SFB_tmem, use_2cta=True) + + T.tcgen05_gemm_blockscaled( + A_shared[stage, :, :], + B_shared[stage, :, :], + C_tmem, + SFA_tmem, + SFB_tmem, + transpose_B=transpose_B, + mbar=consumed[stage], + clear_accum=k == 0, + sf_a_id=k % sf_load_period, + sf_b_id=k % sf_load_period, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full, arrive_2cta=True) + + elif warp_idx == 2: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], phase) + if k % sf_load_period == 0: + 
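+                    # Rewrite each 128-word SF chunk from a 4 x 32 to a 32 x 4
+                    # word view, matching the tcgen05.cp.32x128b.warpx4 source
+                    # pattern (see mxfp8_illustrated.md).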
T.tcgen05_sf_warp_transpose(SFA_shared[stage, :]) + T.tcgen05_sf_warp_transpose(SFB_shared[stage, :]) + T.fence_proxy_async() + T.mbarrier_arrive(with_sf_full[stage], 0) + + T.mbarrier_wait_parity(tmem_full, 0) + T.copy(C_tmem, C_local) + T.copy(C_local, C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N]) + + return C + + +@tilelang.jit +def mxfp8_blockscaled_gemm_2cta_persistent( + A, + B, + SFA, + SFB, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + sf_granularity_k=128, + transpose_B=False, + use_tma_store=True, + store_block_N=64, +): + M, N, K = T.const("M, N, K") + + half_N = block_N // 2 + k_iters = T.ceildiv(K, block_K) + sf_load_period = sf_granularity_k * 4 // block_K + sf_k_groups = T.ceildiv(T.ceildiv(K, sf_granularity_k), 4) + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[N, K] if transpose_B else [K, N], in_dtype] + SFA: T.Tensor[[sf_k_groups * M], T.uint32] + SFB: T.Tensor[[sf_k_groups * N], T.uint32] + C = T.empty((M, N), out_dtype) + + sm_num = driver.get_num_sms() + num_clusters = sm_num // 2 + m_blocks = T.ceildiv(M, block_M) + m_clusters = m_blocks // 2 + n_blocks = T.ceildiv(N, block_N) + assert K % (2 * block_K) == 0 # for simplicity + waves = T.ceildiv(m_blocks * n_blocks, sm_num) + group_size = 16 # in cluster + assert n_blocks % (2 * group_size) == 0 # Please adjust group_size if not satisfied + + with T.Kernel(sm_num, threads=256, cluster_dims=2) as (block_id): + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) + + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared( + (num_stages, half_N, block_K) if transpose_B else (num_stages, block_K, half_N), + in_dtype, + ) + SFA_shared = T.alloc_shared((num_stages, block_M), "uint32") + SFB_shared = T.alloc_shared((num_stages, block_N), "uint32") + + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + SFA_tmem = T.alloc_tmem([block_M, block_M // 128 * 4], "uint32") + SFB_tmem = T.alloc_tmem([block_M, block_N // 128 * 4], "uint32") + + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, store_block_N), out_dtype) + + loaded = T.alloc_barrier([32] * num_stages) + with_sf_full = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_cluster_barrier([1]) + tmem_empty = T.alloc_cluster_barrier([128 * 2]) + + tx = T.get_thread_binding() + warp_idx = tx // 32 + + if warp_idx == 0: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(consumed[stage], parity ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[stage, :, :], + barrier=loaded[stage], + ) + if transpose_B: + T.tma_copy( + B[ + by * block_N + cta_id * half_N : by * block_N + (cta_id + 1) * half_N, + k * block_K : (k + 1) * block_K, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + else: + T.tma_copy( + B[ + k * block_K : (k + 1) * block_K, + by * block_N + cta_id * half_N : by * block_N + (cta_id + 1) * half_N, + ], + 
B_shared[stage, :, :], + barrier=loaded[stage], + ) + if k % sf_load_period == 0: + sf_group_idx = k // sf_load_period + T.tma_copy( + SFA[sf_group_idx * M + bx * block_M : sf_group_idx * M + (bx + 1) * block_M], + SFA_shared[stage, :], + barrier=loaded[stage], + ) + T.tma_copy( + SFB[sf_group_idx * N + by * block_N : sf_group_idx * N + (by + 1) * block_N], + SFB_shared[stage, :], + barrier=loaded[stage], + ) + T.mbarrier_arrive(loaded[stage]) + + elif warp_idx == 1 and cta_id == 0: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_empty, (w & 1) ^ 1) + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(with_sf_full[stage], parity) + if k % sf_load_period == 0: + T.tcgen05_cp_warpx4(SFA_shared[stage, :], SFA_tmem, use_2cta=True) + T.tcgen05_cp_warpx4(SFB_shared[stage, :], SFB_tmem, use_2cta=True) + T.tcgen05_gemm_blockscaled( + A_shared[stage, :, :], + B_shared[stage, :, :], + C_tmem, + SFA_tmem, + SFB_tmem, + transpose_B=transpose_B, + mbar=consumed[stage], + clear_accum=k == 0, + sf_a_id=k % sf_load_period, + sf_b_id=k % sf_load_period, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full, arrive_2cta=True) + + elif warp_idx == 2: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], parity) + if k % sf_load_period == 0: + T.tcgen05_sf_warp_transpose(SFA_shared[stage, :]) + T.tcgen05_sf_warp_transpose(SFB_shared[stage, :]) + T.fence_proxy_async() + T.mbarrier_arrive(with_sf_full[stage], 0) + + elif 128 <= tx < 256: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_full, w & 1) + T.copy(C_tmem, C_local) + T.mbarrier_arrive(tmem_empty, 0) + + if use_tma_store: + for i in T.unroll(T.ceildiv(block_N, store_block_N)): + T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N + i * store_block_N]) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) + return C + + +def unpack_sf_u32_1d(packed_sf, mn, sf_k_blocks): + sf_k_groups = (sf_k_blocks + 3) // 4 + packed_2d = packed_sf.view(sf_k_groups, mn).T.contiguous().to(torch.int64) + unpacked = torch.empty((mn, sf_k_groups * 4), device=packed_sf.device, dtype=torch.uint8) + for i in range(4): + unpacked[:, i::4] = ((packed_2d >> (8 * i)) & 0xFF).to(torch.uint8) + return unpacked[:, :sf_k_blocks].contiguous() + + +def pack_sf_u8_to_u32_1d(sf_u8): + assert sf_u8.dtype == torch.uint8 + assert sf_u8.dim() == 2 + mn, sf_k_padded = sf_u8.shape + assert sf_k_padded % 4 == 0 
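+    # Little-endian packing: byte i of each uint32 word holds the scale for
+    # K group 4 * g + i, matching the sf_id = k % 4 selection in the kernels.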
+ words = sf_u8.to(torch.int64) + packed = (words[:, 0::4] | (words[:, 1::4] << 8) | (words[:, 2::4] << 16) | (words[:, 3::4] << 24)).to(torch.uint32) + return packed.T.contiguous().reshape(-1) + + +def quantize_fp8_with_packed_ue8m0(x, gran_k=128): + """DeepGEMM-style per-token FP8 quantization with UE8M0 scale factors. + + Returns: + x_fp8: [MN, K] in float8_e4m3fn + sf_packed_u32: flattened group-major packed uint32 scale factors + sf_u8: [MN, ceil(K / gran_k)] unpacked E8M0 exponents + """ + + def ceil_div_int(x, y): + return (x + y - 1) // y + + def align_up(x, y): + return ceil_div_int(x, y) * y + + def ceil_to_ue8m0(x): + bits = x.abs().float().view(torch.int32) + exp = ((bits >> 23) & 0xFF) + (bits & 0x7FFFFF).ne(0).to(torch.int32) + return (exp.clamp(1, 254) << 23).view(torch.float32) + + assert x.dim() == 2 + mn, k = x.shape + padded_k = align_up(k, gran_k) + + x_padded = torch.zeros((mn, padded_k), device=x.device, dtype=x.dtype) + x_padded[:, :k] = x + x_view = x_padded.view(mn, padded_k // gran_k, gran_k) + + x_amax = x_view.abs().float().amax(dim=2).clamp_min(1e-4) + sf = ceil_to_ue8m0(x_amax / 448.0) + + x_fp8 = (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn) + x_fp8 = x_fp8.view(mn, padded_k)[:, :k].contiguous() + + sf_u8 = (sf.contiguous().view(torch.int32) >> 23).to(torch.uint8) + sf_k_blocks = sf_u8.shape[1] + sf_k_padded = align_up(sf_k_blocks, 4) + if sf_k_padded != sf_k_blocks: + sf_u8_padded = torch.full((mn, sf_k_padded), 127, device=x.device, dtype=torch.uint8) + sf_u8_padded[:, :sf_k_blocks] = sf_u8 + else: + sf_u8_padded = sf_u8 + + sf_packed_u32 = pack_sf_u8_to_u32_1d(sf_u8_padded) + return x_fp8, sf_packed_u32, sf_u8 + + +def blockscaled_gemm_ref(a, b, sfa_packed, sfb_packed, sf_granularity_k=128, transpose_B=False): + """Torch reference for block-scaled MXFP8 GEMM. 
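+
+    Dequantizes A and B one sf_granularity_k-wide K block at a time using
+    2^(ue8m0 - 127) scales and accumulates in float32 (correctness check only).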
+ + Args: + a: [M, K] FP8 tensor + b: [K, N] FP8 tensor, or [N, K] when transpose_B=True + sfa_packed: [(sf_k_blocks / 4) * M] uint32 packed E8M0 scale factors for A + sfb_packed: [(sf_k_blocks / 4) * N] uint32 packed E8M0 scale factors for B + sf_granularity_k: number of K elements per scale factor block (default 128) + + Returns: + [M, N] float32 result + """ + M, K = a.shape + if transpose_B: + N, K2 = b.shape + else: + K2, N = b.shape + assert K == K2 + sf_k_blocks = (K + sf_granularity_k - 1) // sf_granularity_k + sfa_unpacked = unpack_sf_u32_1d(sfa_packed, M, sf_k_blocks) + sfb_unpacked = unpack_sf_u32_1d(sfb_packed, N, sf_k_blocks) + + a_f32 = a.to(torch.float32) + b_f32 = b.to(torch.float32) + + # E8M0 exponent to float scale: 2^(exp - 127) + sfa_scales = torch.pow(2.0, sfa_unpacked.to(torch.float32) - 127.0) # [M, sf_k_blocks] + sfb_scales = torch.pow(2.0, sfb_unpacked.to(torch.float32) - 127.0) # [N, sf_k_blocks] + + c = torch.zeros(M, N, device=a.device, dtype=torch.float32) + for bi in range(sf_k_blocks): + k_start = bi * sf_granularity_k + k_end = min(k_start + sf_granularity_k, K) + # Scale A block: [M, block_k] * [M, 1] + a_block = a_f32[:, k_start:k_end] * sfa_scales[:, bi : bi + 1] + if transpose_B: + # Scale B block: [N, block_k] * [N, 1] + b_block = b_f32[:, k_start:k_end] * sfb_scales[:, bi : bi + 1] + c += a_block @ b_block.T + else: + # Scale B block: [block_k, N] * [1, N] + b_block = b_f32[k_start:k_end, :] * sfb_scales[:, bi : bi + 1].T + c += a_block @ b_block + return c + + +def cosine_similarity(a, b): + a_flat = a.flatten().float() + b_flat = b.flatten().float() + return (a_flat @ b_flat) / (a_flat.norm() * b_flat.norm()) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--use-e2e-quant-path", action="store_true", default=True) + parser.add_argument("--persistent", action="store_true", default=True) + parser.add_argument("--enable-2cta", action="store_true", default=True) + parser.add_argument("--transpose-b", action="store_true", help="Use B as [N, K] and compute A @ B.T.") + return parser.parse_args() + + +def main(): + args = parse_args() + + M, N, K = 8192, 8192, 8192 + block_M, block_N, block_K = 128, 256, 128 + in_dtype, out_dtype, accum_dtype = T.float8_e4m3fn, T.bfloat16, T.float + use_e2e_quant_path = args.use_e2e_quant_path + persistent = args.persistent + enable_2cta = args.enable_2cta + transpose_B = args.transpose_b + num_stages = 6 if enable_2cta else 4 + if persistent: + assert enable_2cta + kernel = mxfp8_blockscaled_gemm_2cta_persistent + else: + kernel = mxfp8_blockscaled_gemm_2cta if enable_2cta else mxfp8_blockscaled_gemm + sf_granularity_k = 128 + assert sf_granularity_k == 128 + + if use_e2e_quant_path: + # End-to-end path: + # fp16/bf16 source tensors -> per-token FP8 quantization with UE8M0 SF + # -> pack 4 SF entries into one uint32 -> blockscaled GEMM + x = torch.randn(M, K, device="cuda", dtype=torch.float16) + w_nt = torch.randn(N, K, device="cuda", dtype=torch.float16) + + a, sfa, _ = quantize_fp8_with_packed_ue8m0(x, gran_k=sf_granularity_k) + b_nt, sfb, _ = quantize_fp8_with_packed_ue8m0(w_nt, gran_k=sf_granularity_k) + b = b_nt if transpose_B else b_nt.T.contiguous() + else: + a = torch.randn(M, K, device="cuda", dtype=torch.float16).to(torch.float8_e4m3fn) + if transpose_B: + b = torch.randn(N, K, device="cuda", dtype=torch.float16).to(torch.float8_e4m3fn) + else: + b = torch.randn(K, N, device="cuda", dtype=torch.float16).to(torch.float8_e4m3fn) + + # E8M0 scale factors: one uint32 per row per 4 
K-blocks. + sf_k_blocks = (K + sf_granularity_k - 1) // sf_granularity_k + + # Pad to multiple of 4 (UTCCP loads 4 K-blocks at a time) + sf_k_padded = ((sf_k_blocks + 3) // 4) * 4 + sfa_u8 = torch.randint(127 - 5, 127 + 5, (M, sf_k_padded), device="cuda", dtype=torch.uint8) + sfb_u8 = torch.randint(127 - 5, 127 + 5, (N, sf_k_padded), device="cuda", dtype=torch.uint8) + sfa = pack_sf_u8_to_u32_1d(sfa_u8) + sfb = pack_sf_u8_to_u32_1d(sfb_u8) + + c = kernel( + a, + b, + sfa, + sfb, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + sf_granularity_k, + transpose_B, + ) + print( + kernel.get_kernel_source( + a, + b, + sfa, + sfb, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + sf_granularity_k, + transpose_B, + ) + ) + + if use_e2e_quant_path: + # For the end-to-end quantization path, compare against the reference with bf16 gemm + ref_c = (x.float() @ w_nt.float().T).to(torch.bfloat16) + else: + ref_c = blockscaled_gemm_ref(a, b, sfa, sfb, sf_granularity_k, transpose_B=transpose_B).to(torch.bfloat16) + sim = cosine_similarity(c, ref_c) + + print(f"Output shape: {c.shape}, dtype: {c.dtype}") + print(f"E2E quant path: {use_e2e_quant_path}") + print(f"transpose_B: {transpose_B}") + print(f"{c=}, {ref_c=}") + # print(f"Max abs error: {(c.float() - ref_c.float()).abs().max().item():.6f}") + print(f"Cosine similarity: {sim.item():.6f}") + if use_e2e_quant_path: + assert 1 - sim < 1e-3 # err tolerance from DeepGEMM + print("e2e check passed ✅") + + tl_latency = do_bench( + lambda: kernel( + a, + b, + sfa, + sfb, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + sf_granularity_k, + transpose_B, + ), + backend="cupti", + ) + print(f"Tilelang MXFP8 latency: {tl_latency} ms") + print(f"TFLOPs: {2 * M * N * K / (tl_latency / 1e3) / 1e12:.2f}") + + +if __name__ == "__main__": + main() diff --git a/examples/blockscaled_gemm_sm100/grouped_gemm_mxfp8_blockscaled_1d1d.py b/examples/blockscaled_gemm_sm100/grouped_gemm_mxfp8_blockscaled_1d1d.py new file mode 100644 index 0000000000..eaa87e994e --- /dev/null +++ b/examples/blockscaled_gemm_sm100/grouped_gemm_mxfp8_blockscaled_1d1d.py @@ -0,0 +1,657 @@ +# Grouped MXFP8 block-scaled GEMM on SM100. +# Blockscale size: (M, N, K) = (1, 1, 128) + +import argparse + +import torch +import tilelang +import tilelang.language as T +from tilelang.carver.arch import driver +from tilelang.profiler import do_bench + + +@tilelang.jit +def grouped_mxfp8_blockscaled_gemm_2cta( + A, + B, + SFA, + SFB, + offsets, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + max_M_per_E, + transpose_B=True, + sf_granularity_k=128, +): + """Grouped 2CTA MXFP8 blockscaled GEMM. + + Logical scale shape follows tilelang_gemm.py: + SFA [M_total, sf_k_packed], SFB [E, N, sf_k_packed] + + Kernel scale operands are group-major flat buffers so the SF loads can use + the same contiguous TMA pattern as mxfp8_blockscaled_gemm_2cta. 
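+
+    offsets: [E + 1] int32 prefix sums with a leading 0, so expert eid owns
+    rows A[offsets[eid]:offsets[eid + 1]].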
+ """ + M_total, N, K, E, E1 = T.const("M_total, N, K, E, E1") + + assert block_M == 128 + assert block_N == 256 + assert block_K == 128 + assert sf_granularity_k == 128 + + half_N = block_N // 2 + k_iters = T.ceildiv(K, block_K) + sf_load_period = sf_granularity_k * 4 // block_K + sf_k_groups = T.ceildiv(T.ceildiv(K, sf_granularity_k), 4) + assert sf_load_period == 4 + + A: T.Tensor[[M_total, K], in_dtype] + B: T.Tensor[[E, N, K] if transpose_B else [E, K, N], in_dtype] + SFA: T.Tensor[[sf_k_groups * M_total], T.uint32] + SFB: T.Tensor[[sf_k_groups * E * N], T.uint32] + offsets: T.Tensor[[E1], T.int32] + C = T.empty((M_total, N), out_dtype) + + n_blocks = T.ceildiv(N, block_N) + max_M_blocks = T.ceildiv(max_M_per_E, block_M) + max_M_blocks_padded = T.ceildiv(max_M_blocks, 2) * 2 + + with T.Kernel(max_M_blocks_padded, n_blocks, E, threads=128, cluster_dims=2) as (pid_m, pid_n, eid): + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) + + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared( + (num_stages, half_N, block_K) if transpose_B else (num_stages, block_K, half_N), + in_dtype, + ) + SFA_shared = T.alloc_shared((num_stages, block_M), "uint32") + SFB_shared = T.alloc_shared((num_stages, block_N), "uint32") + + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + SFA_tmem = T.alloc_tmem([block_M, 4], "uint32") + SFB_tmem = T.alloc_tmem([block_M, 8], "uint32") + + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + + loaded = T.alloc_barrier([32] * num_stages) + with_sf_full = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1]) + + tx = T.get_thread_binding() + warp_idx = tx // 32 + T.use_swizzle(16) + + start_m = offsets[eid] + end_m = offsets[eid + 1] + m_size = end_m - start_m + expert_m_blocks = T.ceildiv(m_size, block_M) + clamped_pid_m = T.min(pid_m, T.max(expert_m_blocks, 1) - 1) + tile_m = start_m + clamped_pid_m * block_M + + if warp_idx == 0: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(consumed[stage], phase ^ 1) + T.tma_copy( + A[tile_m : tile_m + block_M, k * block_K : (k + 1) * block_K], + A_shared[stage, :, :], + barrier=loaded[stage], + ) + if transpose_B: + T.tma_copy( + B[ + eid, + pid_n * block_N + cta_id * half_N : pid_n * block_N + (cta_id + 1) * half_N, + k * block_K : (k + 1) * block_K, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + else: + T.tma_copy( + B[ + eid, + k * block_K : (k + 1) * block_K, + pid_n * block_N + cta_id * half_N : pid_n * block_N + (cta_id + 1) * half_N, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + if k % sf_load_period == 0: + sf_group_idx = k // sf_load_period + T.tma_copy( + SFA[sf_group_idx * M_total + tile_m : sf_group_idx * M_total + tile_m + block_M], + SFA_shared[stage, :], + barrier=loaded[stage], + ) + T.tma_copy( + SFB[sf_group_idx * E * N + eid * N + pid_n * block_N : sf_group_idx * E * N + eid * N + (pid_n + 1) * block_N], + SFB_shared[stage, :], + barrier=loaded[stage], + ) + T.mbarrier_arrive(loaded[stage]) + + elif warp_idx == 1 and cta_id == 0: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(with_sf_full[stage], phase) + if k % sf_load_period == 0: + T.tcgen05_cp_warpx4(SFA_shared[stage, :], 
SFA_tmem, use_2cta=True) + T.tcgen05_cp_warpx4(SFB_shared[stage, :], SFB_tmem, use_2cta=True) + + T.tcgen05_gemm_blockscaled( + A_shared[stage, :, :], + B_shared[stage, :, :], + C_tmem, + SFA_tmem, + SFB_tmem, + transpose_B=transpose_B, + mbar=consumed[stage], + clear_accum=k == 0, + sf_a_id=k % sf_load_period, + sf_b_id=k % sf_load_period, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full, arrive_2cta=True) + + elif warp_idx == 2: + for k in T.serial(k_iters): + stage = k % num_stages + phase = (k // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], phase) + if k % sf_load_period == 0: + T.tcgen05_sf_warp_transpose(SFA_shared[stage, :]) + T.tcgen05_sf_warp_transpose(SFB_shared[stage, :]) + T.fence_proxy_async() + T.mbarrier_arrive(with_sf_full[stage], 0) + + T.mbarrier_wait_parity(tmem_full, 0) + T.copy(C_tmem, C_local) + + if pid_m * block_M < m_size and tile_m + block_M <= end_m: + T.copy(C_local, C_shared) + T.copy(C_shared, C[tile_m, pid_n * block_N]) + elif pid_m * block_M < m_size: + T.copy(C_local, C_local_cast) + actual_rows = end_m - tile_m + for i, j in T.Parallel(block_M, block_N): + if i < actual_rows and pid_n * block_N + j < N: + C[tile_m + i, pid_n * block_N + j] = C_local_cast[i, j] + + return C + + +@tilelang.jit +def grouped_mxfp8_blockscaled_gemm_2cta_persistent( + A, + B, + SFA, + SFB, + offsets, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + max_M_per_E, + transpose_B=True, + sf_granularity_k=128, + store_block_N=64, +): + """Persistent grouped 2CTA MXFP8 blockscaled GEMM with one accumulator TMEM.""" + M_total, N, K, E, E1 = T.const("M_total, N, K, E, E1") + + assert block_M == 128 + assert block_N == 256 + assert block_K == 128 + assert sf_granularity_k == 128 + + half_N = block_N // 2 + k_iters = T.ceildiv(K, block_K) + sf_load_period = sf_granularity_k * 4 // block_K + sf_k_groups = T.ceildiv(T.ceildiv(K, sf_granularity_k), 4) + assert sf_load_period == 4 + + A: T.Tensor[[M_total, K], in_dtype] + B: T.Tensor[[E, N, K] if transpose_B else [E, K, N], in_dtype] + SFA: T.Tensor[[sf_k_groups * M_total], T.uint32] + SFB: T.Tensor[[sf_k_groups * E * N], T.uint32] + offsets: T.Tensor[[E1], T.int32] + C = T.empty((M_total, N), out_dtype) + + sm_num = driver.get_num_sms() + num_clusters = sm_num // 2 + n_blocks = T.ceildiv(N, block_N) + max_M_blocks = T.ceildiv(max_M_per_E, block_M) + max_M_blocks_padded = T.ceildiv(max_M_blocks, 2) * 2 + m_clusters = max_M_blocks_padded // 2 + total_cluster_tiles = E * n_blocks * m_clusters + waves = T.ceildiv(total_cluster_tiles, num_clusters) + group_size = 8 + + with T.Kernel(sm_num, threads=256, cluster_dims=2) as (block_id): + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) + + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared( + (num_stages, half_N, block_K) if transpose_B else (num_stages, block_K, half_N), + in_dtype, + ) + SFA_shared = T.alloc_shared((num_stages, block_M), "uint32") + SFB_shared = T.alloc_shared((num_stages, block_N), "uint32") + + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + SFA_tmem = T.alloc_tmem([block_M, block_M // 128 * 4], "uint32") + SFB_tmem = T.alloc_tmem([block_M, block_N // 128 * 4], "uint32") + + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, store_block_N), out_dtype) + + loaded = T.alloc_barrier([32] * num_stages) + with_sf_full = T.alloc_cluster_barrier([32 * 2] * 
num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_cluster_barrier([1]) + tmem_empty = T.alloc_cluster_barrier([128 * 2]) + + tx = T.get_thread_binding() + warp_idx = tx // 32 + + if warp_idx == 0: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + eid = tile_id // (n_blocks * m_clusters) + local_tile_id = tile_id - eid * n_blocks * m_clusters + num_pid_in_group = group_size * n_blocks + group_id = local_tile_id // num_pid_in_group + first_pid_m_cluster = group_id * group_size + group_m = T.min(m_clusters - first_pid_m_cluster, group_size) + pid_m_cluster = first_pid_m_cluster + (local_tile_id % num_pid_in_group) % group_m + pid_n = (local_tile_id % num_pid_in_group) // group_m + + if tile_id < total_cluster_tiles: + start_m = offsets[eid] + end_m = offsets[eid + 1] + m_size = end_m - start_m + expert_m_blocks = T.ceildiv(m_size, block_M) + pid_m = pid_m_cluster * 2 + cta_id + safe_pid_m = T.min(pid_m, T.max(expert_m_blocks, 1) - 1) + tile_m = start_m + safe_pid_m * block_M + + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(consumed[stage], parity ^ 1) + T.tma_copy( + A[tile_m : tile_m + block_M, k * block_K : (k + 1) * block_K], + A_shared[stage, :, :], + barrier=loaded[stage], + ) + if transpose_B: + T.tma_copy( + B[ + eid, + pid_n * block_N + cta_id * half_N : pid_n * block_N + (cta_id + 1) * half_N, + k * block_K : (k + 1) * block_K, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + else: + T.tma_copy( + B[ + eid, + k * block_K : (k + 1) * block_K, + pid_n * block_N + cta_id * half_N : pid_n * block_N + (cta_id + 1) * half_N, + ], + B_shared[stage, :, :], + barrier=loaded[stage], + ) + if k % sf_load_period == 0: + sf_group_idx = k // sf_load_period + T.tma_copy( + SFA[sf_group_idx * M_total + tile_m : sf_group_idx * M_total + tile_m + block_M], + SFA_shared[stage, :], + barrier=loaded[stage], + ) + T.tma_copy( + SFB[ + sf_group_idx * E * N + eid * N + pid_n * block_N : sf_group_idx * E * N + + eid * N + + (pid_n + 1) * block_N + ], + SFB_shared[stage, :], + barrier=loaded[stage], + ) + T.mbarrier_arrive(loaded[stage]) + + elif warp_idx == 1 and cta_id == 0: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + + if tile_id < total_cluster_tiles: + T.mbarrier_wait_parity(tmem_empty, (w & 1) ^ 1) + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(with_sf_full[stage], parity) + if k % sf_load_period == 0: + T.tcgen05_cp_warpx4(SFA_shared[stage, :], SFA_tmem, use_2cta=True) + T.tcgen05_cp_warpx4(SFB_shared[stage, :], SFB_tmem, use_2cta=True) + T.tcgen05_gemm_blockscaled( + A_shared[stage, :, :], + B_shared[stage, :, :], + C_tmem, + SFA_tmem, + SFB_tmem, + transpose_B=transpose_B, + mbar=consumed[stage], + clear_accum=k == 0, + sf_a_id=k % sf_load_period, + sf_b_id=k % sf_load_period, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full, arrive_2cta=True) + + elif warp_idx == 2: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + + if tile_id < total_cluster_tiles: + for k in T.serial(k_iters): + phase = w * k_iters + k + stage = phase % num_stages + parity = (phase // num_stages) & 1 + T.mbarrier_wait_parity(loaded[stage], parity) + if k % sf_load_period == 0: + 
T.tcgen05_sf_warp_transpose(SFA_shared[stage, :]) + T.tcgen05_sf_warp_transpose(SFB_shared[stage, :]) + T.fence_proxy_async() + T.mbarrier_arrive(with_sf_full[stage], 0) + + elif 128 <= tx < 256: + for w in T.unroll(waves): + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + eid = tile_id // (n_blocks * m_clusters) + local_tile_id = tile_id - eid * n_blocks * m_clusters + num_pid_in_group = group_size * n_blocks + group_id = local_tile_id // num_pid_in_group + first_pid_m_cluster = group_id * group_size + group_m = T.min(m_clusters - first_pid_m_cluster, group_size) + pid_m_cluster = first_pid_m_cluster + (local_tile_id % num_pid_in_group) % group_m + pid_n = (local_tile_id % num_pid_in_group) // group_m + pid_m = pid_m_cluster * 2 + cta_id + + if tile_id < total_cluster_tiles: + start_m = offsets[eid] + end_m = offsets[eid + 1] + m_size = end_m - start_m + tile_m = start_m + pid_m * block_M + T.mbarrier_wait_parity(tmem_full, w & 1) + T.copy(C_tmem, C_local) + T.mbarrier_arrive(tmem_empty, 0) + + if pid_m * block_M < m_size and tile_m + block_M <= end_m: + for i in T.unroll(T.ceildiv(block_N, store_block_N)): + T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared) + T.copy(C_shared, C[tile_m, pid_n * block_N + i * store_block_N]) + elif pid_m * block_M < m_size: + T.copy(C_local, C_local_cast) + actual_rows = end_m - tile_m + for i, j in T.Parallel(block_M, block_N): + if i < actual_rows and pid_n * block_N + j < N: + C[tile_m + i, pid_n * block_N + j] = C_local_cast[i, j] + + return C + + +def pack_sf_u8_to_u32_rows(sf_u8): + assert sf_u8.dtype == torch.uint8 + assert sf_u8.dim() == 2 + assert sf_u8.shape[1] % 4 == 0 + words = sf_u8.to(torch.int64) + return (words[:, 0::4] | (words[:, 1::4] << 8) | (words[:, 2::4] << 16) | (words[:, 3::4] << 24)).to(torch.uint32).contiguous() + + +def pack_rows_to_group_major_flat(packed_rows): + return packed_rows.contiguous().T.contiguous().reshape(-1) + + +def pack_sfb_to_group_major_flat(packed_sfb): + return packed_sfb.contiguous().permute(2, 0, 1).contiguous().reshape(-1) + + +def unpack_sf_u32_rows(packed_sf, sf_k_blocks): + words = packed_sf.contiguous().view(-1, packed_sf.shape[-1]).to(torch.int64) + unpacked = torch.empty((words.shape[0], words.shape[1] * 4), device=packed_sf.device, dtype=torch.uint8) + for i in range(4): + unpacked[:, i::4] = ((words >> (8 * i)) & 0xFF).to(torch.uint8) + return unpacked[:, :sf_k_blocks].view(*packed_sf.shape[:-1], sf_k_blocks).contiguous() + + +def quantize_fp8_with_packed_ue8m0_rows(x, gran_k=128): + def ceil_div_int(x, y): + return (x + y - 1) // y + + def align_up(x, y): + return ceil_div_int(x, y) * y + + def ceil_to_ue8m0(x): + bits = x.abs().float().view(torch.int32) + exp = ((bits >> 23) & 0xFF) + (bits & 0x7FFFFF).ne(0).to(torch.int32) + return (exp.clamp(1, 254) << 23).view(torch.float32) + + assert x.dim() == 2 + mn, k = x.shape + padded_k = align_up(k, gran_k) + x_padded = torch.zeros((mn, padded_k), device=x.device, dtype=x.dtype) + x_padded[:, :k] = x + x_view = x_padded.view(mn, padded_k // gran_k, gran_k) + + x_amax = x_view.abs().float().amax(dim=2).clamp_min(1e-4) + sf = ceil_to_ue8m0(x_amax / 448.0) + x_fp8 = (x_view * (1.0 / sf.unsqueeze(2))).to(torch.float8_e4m3fn) + x_fp8 = x_fp8.view(mn, padded_k)[:, :k].contiguous() + + sf_u8 = (sf.contiguous().view(torch.int32) >> 23).to(torch.uint8) + sf_k_padded = align_up(sf_u8.shape[1], 4) + if sf_k_padded != sf_u8.shape[1]: + sf_u8_padded = torch.full((mn, sf_k_padded), 127, device=x.device, 
dtype=torch.uint8) + sf_u8_padded[:, : sf_u8.shape[1]] = sf_u8 + else: + sf_u8_padded = sf_u8 + return x_fp8, pack_sf_u8_to_u32_rows(sf_u8_padded), sf_u8 + + +def grouped_blockscaled_gemm_ref(a, b, sfa_packed, sfb_packed, offsets, sf_granularity_k=128, transpose_B=True): + m_total, k = a.shape + if transpose_B: + e, n, k2 = b.shape + else: + e, k2, n = b.shape + assert k == k2 + sf_k_blocks = (k + sf_granularity_k - 1) // sf_granularity_k + sfa_unpacked = unpack_sf_u32_rows(sfa_packed, sf_k_blocks) + sfb_unpacked = unpack_sf_u32_rows(sfb_packed, sf_k_blocks) + + a_f32 = a.to(torch.float32) + b_f32 = b.to(torch.float32) + sfa_scales = torch.pow(2.0, sfa_unpacked.to(torch.float32) - 127.0) + sfb_scales = torch.pow(2.0, sfb_unpacked.to(torch.float32) - 127.0) + + c = torch.empty((m_total, n), device=a.device, dtype=torch.float32) + for eid in range(e): + start = int(offsets[eid].item()) + end = int(offsets[eid + 1].item()) + if start == end: + continue + out = torch.zeros((end - start, n), device=a.device, dtype=torch.float32) + for bi in range(sf_k_blocks): + k_start = bi * sf_granularity_k + k_end = min(k_start + sf_granularity_k, k) + a_block = a_f32[start:end, k_start:k_end] * sfa_scales[start:end, bi : bi + 1] + if transpose_B: + b_block = b_f32[eid, :, k_start:k_end] * sfb_scales[eid, :, bi : bi + 1] + out += a_block @ b_block.T + else: + b_block = b_f32[eid, k_start:k_end, :] * sfb_scales[eid, :, bi : bi + 1].T + out += a_block @ b_block + c[start:end] = out + return c + + +def cosine_similarity(a, b): + a_flat = a.flatten().float() + b_flat = b.flatten().float() + return (a_flat @ b_flat) / (a_flat.norm() * b_flat.norm()) + + +def make_offsets(batch_sizes, device): + offsets = torch.zeros(len(batch_sizes) + 1, device=device, dtype=torch.int32) + offsets[1:] = torch.tensor(batch_sizes, device=device, dtype=torch.int32).cumsum(0) + return offsets + + +def run_grouped_mxfp8_blockscaled_gemm( + a, + b, + sfa_flat, + sfb_flat, + offsets, + max_M_per_E, + transpose_B=True, + persistent=True, +): + block_M, block_N, block_K = 128, 256, 128 + in_dtype, out_dtype, accum_dtype = T.float8_e4m3fn, T.bfloat16, T.float + num_stages = 6 + sf_granularity_k = 128 + + m_total, k = a.shape + if transpose_B: + _, n, k2 = b.shape + else: + _, k2, n = b.shape + assert k == k2 + assert n % block_N == 0, f"N={n} not divisible by {block_N}" + assert k % block_K == 0, f"K={k} not divisible by {block_K}" + + kernel = grouped_mxfp8_blockscaled_gemm_2cta_persistent if persistent else grouped_mxfp8_blockscaled_gemm_2cta + return kernel( + a, + b, + sfa_flat, + sfb_flat, + offsets, + block_M, + block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + max_M_per_E, + transpose_B, + sf_granularity_k, + ) + + +def parse_args(): + parser = argparse.ArgumentParser() + parser.add_argument("--batch-sizes", type=str, default="512,1024,1536,2048") + parser.add_argument("--N", type=int, default=8192) + parser.add_argument("--K", type=int, default=8192) + parser.add_argument("--transpose-b", action="store_true", help="Use B as [E, N, K] and compute grouped A @ B.T.") + parser.add_argument("--no-persistent", action="store_true", help="Run the non-persistent 2CTA kernel.") + parser.add_argument("--no-bench", action="store_true") + return parser.parse_args() + + +def main(): + args = parse_args() + batch_sizes = [int(x) for x in args.batch_sizes.split(",") if x.strip()] + transpose_B = args.transpose_b + persistent = not args.no_persistent + device = "cuda" + m_total = sum(batch_sizes) + e = len(batch_sizes) + 
n = args.N + k = args.K + + offsets = make_offsets(batch_sizes, device) + max_M_per_E = max(batch_sizes) + + x = torch.randn(m_total, k, device=device, dtype=torch.float16) + w_nt = torch.randn(e, n, k, device=device, dtype=torch.float16) + + a, sfa, _ = quantize_fp8_with_packed_ue8m0_rows(x) + b_nt, sfb_2d, _ = quantize_fp8_with_packed_ue8m0_rows(w_nt.view(e * n, k)) + b_nt = b_nt.view(e, n, k).contiguous() + sfb = sfb_2d.view(e, n, -1).contiguous() + + sfa_flat = pack_rows_to_group_major_flat(sfa) + sfb_flat = pack_sfb_to_group_major_flat(sfb) + b = b_nt if transpose_B else b_nt.transpose(1, 2).contiguous() + + c = run_grouped_mxfp8_blockscaled_gemm( + a, + b, + sfa_flat, + sfb_flat, + offsets, + max_M_per_E, + transpose_B, + persistent, + ) + ref_c = grouped_blockscaled_gemm_ref(a, b, sfa, sfb, offsets, transpose_B=transpose_B).to(torch.bfloat16) + sim = cosine_similarity(c, ref_c) + max_abs = (c.float() - ref_c.float()).abs().max().item() + + print(f"Output shape: {c.shape}, dtype: {c.dtype}") + print(f"batch_sizes: {batch_sizes}") + print(f"transpose_B: {transpose_B}") + print(f"persistent: {persistent}") + print(f"Cosine similarity: {sim.item():.6f}") + print(f"Max abs error: {max_abs:.6f}") + assert 1 - sim < 1e-5 + print("grouped blockscaled check passed") + + if not args.no_bench: + latency = do_bench( + lambda: run_grouped_mxfp8_blockscaled_gemm( + a, + b, + sfa_flat, + sfb_flat, + offsets, + max_M_per_E, + transpose_B, + persistent, + ), + backend="cupti", + ) + print(f"Tilelang grouped MXFP8 latency: {latency} ms") + print(f"TFLOPs: {2 * m_total * n * k / (latency / 1e3) / 1e12:.2f}") + + +if __name__ == "__main__": + main() diff --git a/examples/blockscaled_gemm_sm100/mxfp8_illustrated.md b/examples/blockscaled_gemm_sm100/mxfp8_illustrated.md new file mode 100644 index 0000000000..48460bed88 --- /dev/null +++ b/examples/blockscaled_gemm_sm100/mxfp8_illustrated.md @@ -0,0 +1,117 @@ +# SM100 MXFP8 Blockscaled Illustration + +This note explains the Blackwell data path used by +[`gemm_mxfp8_blockscaled_1d1d.py`](gemm_mxfp8_blockscaled_1d1d.py) and the +grouped variants in +[`grouped_gemm_mxfp8_blockscaled_1d1d.py`](grouped_gemm_mxfp8_blockscaled_1d1d.py). +The kernels use 1D-1D MXFP8 block scaling: one `ue8m0` scale for each A row +and B column per 128 K elements. Four adjacent K scale blocks are packed into +one `uint32`. + +![SM100 block-scaled GEMM data path](figures/blockscaled_data_path.svg) + +## Kernel Variants + +![1CTA, 2CTA, and persistent 2CTA comparison](figures/blockscaled_variants.svg) + +| Variant | Function | Launch shape | Tile ownership | Main difference | +| --- | --- | --- | --- | --- | +| 1CTA | `mxfp8_blockscaled_gemm` | `threads=128` | One CTA computes one logical `block_M x block_N` tile. | Local barriers. Warp 0 loads A/B/SF, warp 1 issues UTCCP and MMA, warp 2 transposes SF in SMEM. | +| 2CTA | `mxfp8_blockscaled_gemm_2cta` | `threads=128, cluster_dims=2` | A CTA pair computes one logical `128 x 256` tile. | Each peer loads a half-N B panel, A is loaded in both peers, and leader CTA issues `use_2cta=True`. | +| 2CTA persistent | `mxfp8_blockscaled_gemm_2cta_persistent` | `T.Kernel(sm_num, threads=256, cluster_dims=2)` | Resident CTA pairs walk logical tiles over multiple waves. | Adds a persistent scheduler and dedicated epilogue warpgroup, with `tmem_empty` handing TMEM back to the MMA warp. | + +The grouped kernels reuse the same 2CTA and persistent structure. 
Their extra
+work is scheduler-side: map `(pid_m, pid_n, eid)` through `offsets`, clamp tail
+M blocks, and use `SFB[sf_group * E * N + eid * N + n]` for expert-local B
+scales.
+
+## Warp Specialization
+
+![Warp-specialized pipeline](figures/blockscaled_warp_specialization.svg)
+
+The examples are warp-specialized by role, but they do not use the
+`tcgen05.mma.ws` PTX form. The block-scaled call lowers to
+`tcgen05.mma.cta_group::{1,2}.kind::mxf8f6f4.block_scale`; TileLang currently
+rejects combining the block-scaled `.ws` variant with 2CTA.
+
+| Threads | Non-persistent role | Persistent role | Main handoff |
+| --- | --- | --- | --- |
+| warp 0 | TMA producer for A, B, SFA, SFB | Same, inside the wave loop | Wait `consumed`, arrive `loaded`. |
+| warp 1 in leader CTA | UTCCP copy plus `tcgen05.mma.block_scale` issue | Same, but waits `tmem_empty` before each wave | Wait `with_sf_full`, arrive `consumed`, finally arrive `tmem_full`. |
+| warp 2 | SF transposer | Same | Wait `loaded`, transpose packed SF chunks, `fence_proxy_async`, arrive `with_sf_full`. |
+| all warps / warps 4-7 | Copy `C_tmem` through registers/SMEM to global | Dedicated epilogue warpgroup | Wait `tmem_full`; persistent path arrives `tmem_empty` after reading TMEM. |
+
+Two details are easy to miss:
+
+- `tcgen05.mma` has single-thread issue semantics, so one elected lane in the
+  MMA warp initiates the whole operation.
+- `tcgen05.mma` and `tcgen05.cp` access SMEM through the async proxy. After the
+  SF warp rewrites the SF buffer in normal SMEM, it uses `fence_proxy_async()`
+  before the MMA warp uses `tcgen05.cp`.
+
+## Scale-Factor Layout
+
+Blackwell block-scaled tcgen05 MMA instructions require a special layout for scale factors in TMEM. Taking K = 32 and SFA as an example:
+![Layout requirement for SFA in TMEM](figures/sfa.png)
+
+The process below packs the scale factors into the required layout:
+
+![Scale-factor packing and TMEM layout](figures/blockscaled_sf_layout.svg)
+
+For `sf_granularity_k = block_K = 128`, the examples load one packed SF word
+every four K iterations:
+
+```text
+sf_load_period = sf_granularity_k * 4 / block_K = 4
+sf_k_blocks = ceil(K / 128)
+sf_k_groups = ceil(sf_k_blocks / 4)
+```
+
+The global flat layout is group-major:
+
+```text
+SFA[sf_group * M + m]
+SFB[sf_group * N + n]
+
+word = sf0 | (sf1 << 8) | (sf2 << 16) | (sf3 << 24)
+```
+
+For grouped GEMM, SFA is still group-major over the concatenated M dimension,
+while SFB is group-major over `(E, N)`:
+
+```text
+SFA[sf_group * M_total + m]
+SFB[sf_group * E * N + eid * N + n]
+```
+
+Each SF TMA places a 1D `uint32` vector in SMEM:
+
+- SFA has `block_M` words, one per output row.
+- SFB has `block_N` words, one per output column.
+- Each word contains four `ue8m0` bytes for four consecutive 128-wide K
+  groups.
+
+`T.tcgen05_sf_warp_transpose` works on each 128-word chunk. It rewrites a
+`4 x 32` word view into a `32 x 4` word view, matching the
+`tcgen05.cp.32x128b.warpx4` source pattern. `T.tcgen05_cp_warpx4` then copies
+one 128-word chunk into four TMEM columns and duplicates it across the four
+32-lane TMEM partitions, a layout the block-scaled MMA hardware requires.
+
+The resulting TMEM shapes in the 128x256 examples are:
+
+```text
+SFA_tmem: [128 lanes, 4 columns]
+SFB_tmem: [128 lanes, 8 columns] # two 128-column N chunks
+```
+
+During MMA issue, `sf_a_id = k % 4` and `sf_b_id = k % 4` select the active
+byte sub-column from the packed `uint32` cell.
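+
+As a minimal sketch (plain Python with a hypothetical helper name, not kernel
+code), the byte selection and UE8M0 decode look like this:
+
+```python
+def sf_scale_from_word(word: int, sf_id: int) -> float:
+    """Decode the UE8M0 byte selected by sf_id from a packed uint32 SF word."""
+    assert 0 <= sf_id < 4
+    ue8m0 = (word >> (8 * sf_id)) & 0xFF  # byte sub-column picked by k % 4
+    return 2.0 ** (ue8m0 - 127)  # UE8M0 exponent -> float scale
+
+
+# Four consecutive K iterations reuse one packed word with sf_id = 0, 1, 2, 3:
+word = 0x7D7E7F80  # sf0 = 0x80, sf1 = 0x7F, sf2 = 0x7E, sf3 = 0x7D
+print([sf_scale_from_word(word, k % 4) for k in range(4)])  # [2.0, 1.0, 0.5, 0.25]
+```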
This is why one SF TMA load +serves four adjacent `block_K=128` MMA iterations. + +## Related + +- TileLang kernels: [`gemm_mxfp8_blockscaled_1d1d.py`](gemm_mxfp8_blockscaled_1d1d.py) +- Grouped kernels: [`grouped_gemm_mxfp8_blockscaled_1d1d.py`](grouped_gemm_mxfp8_blockscaled_1d1d.py) +- TileLang helpers: `T.tcgen05_cp_warpx4`, `T.tcgen05_sf_warp_transpose`, and + `T.tcgen05_gemm_blockscaled` +- PTX document: https://docs.nvidia.com/cuda/parallel-thread-execution/index.html#tcgen05-block-scaling diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py index 6e73214522..a93e4de135 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_paged.py @@ -2,7 +2,6 @@ import torch import torch.nn.functional as F import tilelang -from tilelang.autotuner import * import tilelang.language as T from einops import rearrange, einsum import argparse @@ -13,160 +12,159 @@ from heuristic import num_splits_heuristic -def flashattn(batch, heads, heads_kv, dim, dim_v): +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, page_block_size, num_stages, threads, num_pages): scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) dtype = T.float16 accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, - ) - def kernel_func( - block_N, block_H, page_block_size, num_split, num_stages, threads, num_pages, max_num_blocks_per_seq, max_selected_blocks + num_split = T.dynamic("num_split") + max_num_blocks_per_seq = T.dynamic("max_num_blocks_per_seq") + max_selected_blocks = T.dynamic("max_selected_blocks") + + shape_q = [batch, heads, dim] + shape_k = [num_pages, page_block_size, heads_kv, dim] + shape_v = [num_pages, page_block_size, heads_kv, dim_v] + shape_indices = [batch, heads_kv, max_selected_blocks] + shape_block_table = [batch, max_num_blocks_per_seq] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + assert block_N <= page_block_size and page_block_size % block_N == 0 + block_ratio = page_block_size // block_N + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + block_table: T.Tensor(shape_block_table, T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), ): - shape_q = [batch, heads, dim] - shape_k = [num_pages, page_block_size, heads_kv, dim] - shape_v = [num_pages, page_block_size, heads_kv, dim_v] - shape_indices = [batch, heads_kv, max_selected_blocks] - shape_block_table = [batch, max_num_blocks_per_seq] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - assert block_N <= page_block_size and page_block_size % block_N == 0 - block_ratio = page_block_size // block_N - - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, T.int32), - cache_seqlens: 
T.Tensor([batch], T.int32), - block_table: T.Tensor(shape_block_table, T.int32), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - # flash_attn_split - with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - num_blocks = max_selected_blocks - blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - for k in T.Pipelined(loop_range, num_stages=num_stages): - logical_block_idx = block_indices[bid, cur_kv_head, start + k] - if logical_block_idx >= 0: - has_valid_block = True - block_table_idx = T.floordiv(logical_block_idx, block_ratio) - block_tile_idx = T.floormod(logical_block_idx, block_ratio) - physical_block_idx = block_table[bid, block_table_idx] - T.copy(K[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], K_shared) - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else( - logical_block_idx * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j] - ) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + # flash_attn_split + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + 
cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + num_blocks = max_selected_blocks + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + for k in T.Pipelined(loop_range, num_stages=num_stages): + logical_block_idx = block_indices[bid, cur_kv_head, start + k] + if logical_block_idx >= 0: + has_valid_block = True + block_table_idx = T.floordiv(logical_block_idx, block_ratio) + block_tile_idx = T.floormod(logical_block_idx, block_ratio) + physical_block_idx = block_table[bid, block_table_idx] + T.copy(K[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy(V[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] - + acc_s[i, j] = T.if_then_else( + logical_block_idx * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j] + ) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[physical_block_idx, block_tile_idx * block_N : (block_tile_idx + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - # combine - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_var(accum_dtype) - lse_logsum_local = T.alloc_var(accum_dtype) - lse_max_local = T.alloc_var(accum_dtype) - scale_local = T.alloc_var(accum_dtype) - max_split = T.alloc_var(T.int32) - - 
T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local = -T.infinity(accum_dtype) - for k in T.serial(num_split): + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: lse_local_split = glse[bz, by, k] - if lse_local_split != 0: - max_split = k - lse_max_local = T.max(lse_max_local, glse[bz, by, k]) - - for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split: - lse_local_split = glse[bz, by, k] - lse_logsum_local += T.exp2(lse_local_split - lse_max_local) - lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local - for k in T.serial(num_split): - if k <= max_split: - for i in T.Parallel(dim_v): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split = glse[bz, by, k] - scale_local = T.exp2(lse_local_split - lse_logsum_local) - for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] - - return main - - return kernel_func + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: + for i in T.Parallel(dim_v): + po_local[i] = Output_partial[bz, by, k, i] + lse_local_split = glse[bz, by, k] + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim_v): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] + + print(main) + return main class SparseFlashAttn(torch.nn.Module): @@ -181,19 +179,6 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, self.page_block_size = page_block_size self.num_pages = num_pages self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_N, - block_H=self.block_H, - page_block_size=page_block_size, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - num_pages=num_pages, - max_num_blocks_per_seq=T.dynamic("max_num_blocks_per_seq"), - max_selected_blocks=T.dynamic("max_selected_blocks"), - ) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -221,16 +206,19 @@ def forward(self, query, key, value, block_indices, cache_seqlens, block_table): glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - output = 
self.kernel( - query, - key, - value, - block_indices, - cache_seqlens, - block_table, - glse, - output_partial, - ) + output = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + page_block_size=self.page_block_size, + num_stages=2, + threads=128, + num_pages=self.num_pages, + )(query, key, value, block_indices, cache_seqlens, block_table, glse, output_partial) return output @@ -513,6 +501,8 @@ def main(args): def run_regression_perf(args): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = ( args.batch, args.heads, @@ -524,15 +514,15 @@ def run_regression_perf(args): sparse_ratio = args.sparse_ratio block_N = args.block_N page_block_size = args.page_block_size - num_blocks = args.num_pages + num_pages = args.num_pages max_selected_blocks = int(math.ceil(max_cache_seqlen / block_N)) dtype = torch.float16 Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") cache_seqlens = torch.randint(max_cache_seqlen // 2, max_cache_seqlen + 1, (batch,), dtype=torch.int32, device="cuda") K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") - K_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") - V_cache = torch.zeros((num_blocks, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") + K_cache = torch.zeros((num_pages, page_block_size, heads_kv, dim), dtype=dtype, device="cuda") + V_cache = torch.zeros((num_pages, page_block_size, heads_kv, dim_v), dtype=dtype, device="cuda") max_num_blocks_per_seq = int(math.ceil(max_cache_seqlen / page_block_size)) block_table = torch.zeros((batch, max_num_blocks_per_seq), dtype=torch.int32, device="cuda") block_indices = torch.zeros((batch, heads_kv, max_selected_blocks), dtype=torch.int32, device="cuda") @@ -596,22 +586,20 @@ def run_regression_perf(args): for i in range(len(selected_blocks), max_selected_blocks): block_indices[seq_idx, head_idx, i] = -1 - sparse_attn = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_blocks) - kernel = sparse_attn.kernel - batch = sparse_attn.batch - heads = sparse_attn.heads - heads_kv = sparse_attn.heads_kv - dim_v = sparse_attn.dim_v - dim = sparse_attn.dim - block_size = sparse_attn.block_N + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, page_block_size, block_N, num_pages) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_N max_selected_blocks = block_indices.shape[-1] - num_m_blocks = 1 * (heads // heads_kv + sparse_attn.block_H - 1) // sparse_attn.block_H + num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H num_n_blocks = max_selected_blocks size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks - - num_sm = sparse_attn.num_sm + num_sm = sparse_kernel.num_sm num_split = num_splits_heuristic( total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 @@ -619,18 +607,22 @@ def run_regression_perf(args): glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = 
flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=sparse_kernel.block_H, + page_block_size=sparse_kernel.page_block_size, + num_stages=2, + threads=128, + num_pages=sparse_kernel.num_pages, + ) def run_kernel_only(): - kernel( - Q, - K_cache, - V_cache, - block_indices, - cache_seqlens, - block_table, - glse, - output_partial, - ) + kernel(Q, K_cache, V_cache, block_indices, cache_seqlens, block_table, glse, output_partial) return do_bench(run_kernel_only, backend="cupti") diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py index d6cf7d9176..b608adfae9 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_indice.py @@ -10,153 +10,150 @@ from tilelang.profiler import do_bench -def flashattn(batch, heads, heads_kv, dim, dim_v): +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, num_stages, threads): scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) dtype = T.float16 accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, - ) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, max_selected_blocks): - shape_q = [batch, heads, dim] - shape_k = [batch, max_cache_seqlen, heads_kv, dim] - shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] - shape_indices = [batch, heads_kv, max_selected_blocks] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_indices: T.Tensor(shape_indices, T.int32), - cache_seqlens: T.Tensor([batch], T.int32), - # actual_num_blocks: T.Tensor([batch], T.int32), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - Output: T.Tensor(shape_o, dtype), - ): - # flash_attn_split(Q, K, V, block_indices, cache_seqlens, actual_num_blocks, glse, Output_partial) - # flash_attn_split - with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - # O_shared = T.alloc_shared([valid_block_H, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - num_blocks = max_selected_blocks - 
blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - - for k in T.Pipelined(loop_range, num_stages=num_stages): - i_s = block_indices[bid, cur_kv_head, start + k] - if i_s >= 0: - has_valid_block = True - T.copy(K[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], K_shared) - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j]) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + num_split = T.dynamic("num_split") + max_cache_seqlen = T.dynamic("max_cache_seqlen") + max_selected_blocks = T.dynamic("max_selected_blocks") + + shape_q = [batch, heads, dim] + shape_k = [batch, max_cache_seqlen, heads_kv, dim] + shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] + shape_indices = [batch, heads_kv, max_selected_blocks] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_indices: T.Tensor(shape_indices, T.int32), + cache_seqlens: T.Tensor([batch], T.int32), + # actual_num_blocks: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), + ): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + num_blocks = max_selected_blocks + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + + for k in T.Pipelined(loop_range, num_stages=num_stages): + i_s = block_indices[bid, cur_kv_head, start + k] + if i_s >= 0: + has_valid_block = True + 
T.copy(K[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + if k == 0: # assume block_indices is sorted in reverse order, otherwise, remove this if condition for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] - + acc_s[i, j] = T.if_then_else(i_s * block_N + j >= cache_seqlens[bid], -T.infinity(accum_dtype), acc_s[i, j]) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[bid, i_s * block_N : (i_s + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - # combine - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_var(accum_dtype) - lse_logsum_local = T.alloc_var(accum_dtype) - lse_max_local = T.alloc_var(accum_dtype) - scale_local = T.alloc_var(accum_dtype) - max_split = T.alloc_var(T.int32) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local = -T.infinity(accum_dtype) - for k in T.serial(num_split): + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in 
T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k + lse_max_local = T.max(lse_max_local, glse[bz, by, k]) + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: + lse_local_split = glse[bz, by, k] + lse_logsum_local += T.exp2(lse_local_split - lse_max_local) + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: + for i in T.Parallel(dim_v): + po_local[i] = Output_partial[bz, by, k, i] lse_local_split = glse[bz, by, k] - if lse_local_split != 0: - max_split = k - lse_max_local = T.max(lse_max_local, glse[bz, by, k]) - - for k in T.Pipelined(num_split, num_stages=1): - if k <= max_split: - lse_local_split = glse[bz, by, k] - lse_logsum_local += T.exp2(lse_local_split - lse_max_local) - lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local - for k in T.serial(num_split): - if k <= max_split: - for i in T.Parallel(dim_v): - po_local[i] = Output_partial[bz, by, k, i] - lse_local_split = glse[bz, by, k] - scale_local = T.exp2(lse_local_split - lse_logsum_local) - for i in T.Parallel(dim_v): - o_accum_local[i] += po_local[i] * scale_local - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] - - return main - - return kernel_func + scale_local = T.exp2(lse_local_split - lse_logsum_local) + for i in T.Parallel(dim_v): + o_accum_local[i] += po_local[i] * scale_local + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] + + return main class SparseFlashAttn(torch.nn.Module): @@ -168,19 +165,7 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): self.dim = dim self.dim_v = dim_v self.block_size = block_size - self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_size, - block_H=self.block_H, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks"), - ) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -208,7 +193,18 @@ def forward(self, query, key, value, block_indices, cache_seqlens): glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - output = self.kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial) + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + num_stages=2, + threads=128, + ) + output = kernel(query, key, value, block_indices, cache_seqlens, glse, output_partial) return output @@ -252,14 +248,16 @@ def sparse_gqa_decode_varlen_indice(query, key, value, block_indices, cache_seql glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, block_N=block_size, block_H=block_H, - num_split=T.dynamic("num_split"), num_stages=2, threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - max_selected_blocks=T.dynamic("max_selected_blocks"), ) output = kernel(query, key, value, block_indices, cache_seqlens, glse, Output_partial) @@ -311,7 +309,7 @@ def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_se return 
output -def debug(name, expect, actual, atol=1e-3, rtol=1e-3): +def assert_close(name, expect, actual, atol=1e-3, rtol=1e-3): all_close = torch.allclose(expect, actual, atol=atol, rtol=rtol) print(name + " all_close={}".format(all_close)) if not all_close: @@ -324,29 +322,17 @@ def debug(name, expect, actual, atol=1e-3, rtol=1e-3): def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v + dtype = torch.float16 sparse_ratio = sparse_ratio block_size = block_size max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) - print("max_selected_blocks: ", max_selected_blocks) - dtype = torch.float16 Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") - # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') - # # Ensure at least one element equals cache_seqlen - # random_index = torch.randint(0, batch, (1,), device='cuda').item() # Select a random index - # # cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence - - print("cache_seqlens: ", cache_seqlens) - max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() - print("max_valid_num_blocks: ", max_valid_num_blocks) - # Initialize block_indices with -1 (for padding blocks) block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") - # max_num_blocks = int((max_cache_seqlen + block_size - 1)/ block_size) - # block_indices = torch.full((batch, heads_kv, max_num_blocks), -1, dtype=torch.int32, device='cuda') # Assign valid indices while ensuring no duplicates within each batch-group for b in range(batch): @@ -354,27 +340,17 @@ def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=12 if max_valid_block > 0: # Ensure there's at least one valid block for h in range(heads_kv): valid_indices = torch.randperm(max_valid_block, device="cuda", dtype=torch.int32)[:max_selected_blocks] - # valid_indices = torch.randperm(max_valid_block, device='cuda', dtype=torch.int32)[:max_num_blocks] block_indices[b, h, : len(valid_indices)] = valid_indices - # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) - # print("block_indices: ", block_indices) - actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)[:, 0] - print("actual_num_blocks: ", actual_num_blocks) - # print(block_indices.shape, actual_num_blocks.shape) - max_num_blocks = torch.max(max_valid_num_blocks).item() - print("max_num_blocks: ", max_num_blocks) # parity reference ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) - debug("output", ref, out, atol=1e-3, rtol=1e-3) - - import flash_attn # noqa: F401 + assert_close("output", ref, out, atol=1e-3, rtol=1e-3) ## latency reference for _ in range(10): @@ -387,12 +363,10 @@ def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, 
dim=128, dim_v=12 print("dense time: ", (time.time() - start) / 100 * 1000) for _ in range(10): - # out = sparse_gqa_decode_varlen_indice(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) torch.cuda.synchronize() start = time.time() for _ in range(100): - # out = sparse_gqa_decode_varlen_indice(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, block_size) out = sparse_kernel(Q, K, V, block_indices, cache_seqlens) torch.cuda.synchronize() print("sparse time: ", (time.time() - start) / 100 * 1000) @@ -428,24 +402,9 @@ def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, di dim_v = sparse_kernel.dim_v dim = sparse_kernel.dim block_size = sparse_kernel.block_size - max_selected_blocks = block_indices.shape[-1] - - num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H - num_n_blocks = max_selected_blocks - size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 - total_mblocks = batch * heads_kv * num_m_blocks - num_sm = sparse_kernel.num_sm - - num_split = num_splits_heuristic( - total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 - ) - - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") - output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - kernel = sparse_kernel.kernel def run_kernel_only(): - kernel(Q, K, V, block_indices, cache_seqlens, glse, output_partial) + sparse_kernel(Q, K, V, block_indices, cache_seqlens) return do_bench(run_kernel_only, backend="cupti") diff --git a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py index e48428fb89..e588ec54cc 100644 --- a/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py +++ b/examples/blocksparse_attention/example_tilelang_sparse_gqa_decode_varlen_mask.py @@ -1,7 +1,6 @@ import torch import torch.nn.functional as F import tilelang -from tilelang.autotuner import * import tilelang.language as T from einops import rearrange, einsum import argparse @@ -11,137 +10,144 @@ from tilelang.profiler import do_bench -def flashattn(batch, heads, heads_kv, dim, dim_v): +@tilelang.jit( + out_idx=[-1], + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def flashattn(batch, heads, heads_kv, dim, dim_v, block_N, block_H, num_stages, threads): scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) dtype = T.float16 accum_dtype = T.float32 kv_group_num = heads // heads_kv - @tilelang.jit( - out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, - ) - def kernel_func(block_N, block_H, num_split, num_stages, threads, max_cache_seqlen, num_blocks): - shape_q = [batch, heads, dim] - shape_k = [batch, max_cache_seqlen, heads_kv, dim] - shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] - shape_mask = [batch, heads_kv, num_blocks] - shape_o = [batch, heads, dim_v] - part_shape = [batch, heads, num_split, dim_v] - valid_block_H = min(block_H, kv_group_num) - - @T.prim_func - def main( - Q: T.Tensor(shape_q, dtype), - K: T.Tensor(shape_k, dtype), - V: T.Tensor(shape_v, dtype), - block_mask: T.Tensor(shape_mask, T.bool), - cache_seqlens: T.Tensor([batch], T.int32), - glse: T.Tensor([batch, heads, num_split], accum_dtype), - Output_partial: T.Tensor(part_shape, accum_dtype), - 
Output: T.Tensor(shape_o, dtype), - ): - with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): - Q_shared = T.alloc_shared([block_H, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim_v], dtype) - acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) - acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) - - scores_max = T.alloc_fragment([block_H], accum_dtype) - scores_max_prev = T.alloc_fragment([block_H], accum_dtype) - scores_scale = T.alloc_fragment([block_H], accum_dtype) - scores_sum = T.alloc_fragment([block_H], accum_dtype) - logsum = T.alloc_fragment([block_H], accum_dtype) - has_valid_block = T.alloc_var("bool") - - bid = bx - hid = by - sid = bz - cur_kv_head = hid // (kv_group_num // valid_block_H) - - T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - blocks_per_split = T.floordiv(num_blocks, num_split) - remaining_blocks = T.floormod(num_blocks, num_split) - loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) - start = blocks_per_split * sid + T.min(sid, remaining_blocks) - has_valid_block = False - for k in T.Pipelined(loop_range, num_stages=num_stages): - if block_mask[bid, hid, start + k]: - has_valid_block = True - T.copy(K[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], K_shared) - T.clear(acc_s) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.if_then_else( - (start + k) * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j] - ) - T.copy(scores_max, scores_max_prev) - T.fill(scores_max, -T.infinity(accum_dtype)) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_H): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_H, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_H): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - T.copy(acc_s, acc_s_cast) - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] *= scores_scale[i] - T.copy(V[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], V_shared) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - if has_valid_block: - for i, j in T.Parallel(block_H, dim_v): - acc_o[i, j] /= logsum[i] + num_split = T.dynamic("num_split") + max_cache_seqlen = T.dynamic("max_cache_seqlen") + num_blocks = T.dynamic("num_blocks") + + shape_q = [batch, heads, dim] + shape_k = [batch, max_cache_seqlen, heads_kv, dim] + shape_v = [batch, max_cache_seqlen, heads_kv, dim_v] + shape_mask = [batch, heads_kv, num_blocks] + shape_o = [batch, heads, dim_v] + part_shape = [batch, heads, num_split, dim_v] + valid_block_H = min(block_H, kv_group_num) + + @T.prim_func + def main( + Q: T.Tensor(shape_q, dtype), + K: T.Tensor(shape_k, dtype), + V: T.Tensor(shape_v, dtype), + block_mask: T.Tensor(shape_mask, T.bool), + cache_seqlens: T.Tensor([batch], T.int32), + glse: T.Tensor([batch, heads, num_split], accum_dtype), + Output_partial: T.Tensor(part_shape, accum_dtype), + Output: T.Tensor(shape_o, dtype), + ): + with T.Kernel(batch, 
heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_H, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim_v], dtype) + acc_s = T.alloc_fragment([block_H, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_H, block_N], dtype) + acc_o = T.alloc_fragment([block_H, dim_v], accum_dtype) + + scores_max = T.alloc_fragment([block_H], accum_dtype) + scores_max_prev = T.alloc_fragment([block_H], accum_dtype) + scores_scale = T.alloc_fragment([block_H], accum_dtype) + scores_sum = T.alloc_fragment([block_H], accum_dtype) + logsum = T.alloc_fragment([block_H], accum_dtype) + has_valid_block = T.alloc_var(T.bool) + + bid = bx + hid = by + sid = bz + cur_kv_head = hid // (kv_group_num // valid_block_H) + + T.copy(Q[bid, hid * valid_block_H : hid * valid_block_H + block_H, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + blocks_per_split = T.floordiv(num_blocks, num_split) + remaining_blocks = T.floormod(num_blocks, num_split) + loop_range = blocks_per_split + T.if_then_else(sid < remaining_blocks, 1, 0) + start = blocks_per_split * sid + T.min(sid, remaining_blocks) + has_valid_block = False + for k in T.Pipelined(loop_range, num_stages=num_stages): + if block_mask[bid, hid, start + k]: + has_valid_block = True + T.copy(K[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], K_shared) + T.clear(acc_s) + T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.if_then_else((start + k) * block_N + j >= cache_seqlens[bx], -T.infinity(accum_dtype), acc_s[i, j]) + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) for i in T.Parallel(block_H): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - - for i in T.Parallel(block_H): - if i < valid_block_H: - glse[bid, hid * valid_block_H + i, sid] = logsum[i] - + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_H, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_H): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + T.copy(acc_s, acc_s_cast) + for i, j in T.Parallel(block_H, dim_v): + acc_o[i, j] *= scores_scale[i] + T.copy(V[bid, (start + k) * block_N : (start + k + 1) * block_N, cur_kv_head, :], V_shared) + T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + if has_valid_block: for i, j in T.Parallel(block_H, dim_v): - if i < valid_block_H: - Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] - - with T.Kernel(heads, batch, threads=128) as (by, bz): - po_local = T.alloc_fragment([dim_v], accum_dtype) - o_accum_local = T.alloc_fragment([dim_v], accum_dtype) - lse_local_split = T.alloc_var(accum_dtype) - lse_logsum_local = T.alloc_var(accum_dtype) - lse_max_local = T.alloc_var(accum_dtype) - scale_local = T.alloc_var(accum_dtype) - - T.clear(lse_logsum_local) - T.clear(o_accum_local) - lse_max_local = -T.infinity(accum_dtype) - for k in T.serial(num_split): + acc_o[i, j] /= logsum[i] + for i in T.Parallel(block_H): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + + # TODO(lei): Support T.Parallel(valid_block_H) + for i in 
T.Parallel(block_H): + if i < valid_block_H: + glse[bid, hid * valid_block_H + i, sid] = logsum[i] + for i, j in T.Parallel(block_H, dim_v): + if i < valid_block_H: + Output_partial[bid, hid * valid_block_H + i, sid, j] = acc_o[i, j] + + # combine + with T.Kernel(heads, batch, threads=128) as (by, bz): + po_local = T.alloc_fragment([dim_v], accum_dtype) + o_accum_local = T.alloc_fragment([dim_v], accum_dtype) + lse_local_split = T.alloc_var(accum_dtype) + lse_logsum_local = T.alloc_var(accum_dtype) + lse_max_local = T.alloc_var(accum_dtype) + scale_local = T.alloc_var(accum_dtype) + max_split = T.alloc_var(T.int32) + + T.clear(lse_logsum_local) + T.clear(o_accum_local) + lse_max_local = -T.infinity(accum_dtype) + for k in T.serial(num_split): + lse_local_split = glse[bz, by, k] + if lse_local_split != 0: + max_split = k lse_max_local = T.max(lse_max_local, glse[bz, by, k]) - for k in T.Pipelined(num_split, num_stages=1): + + for k in T.Pipelined(num_split, num_stages=1): + if k <= max_split: lse_local_split = glse[bz, by, k] lse_logsum_local += T.exp2(lse_local_split - lse_max_local) - lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local - for k in T.serial(num_split): + lse_logsum_local = T.log2(lse_logsum_local) + lse_max_local + for k in T.serial(num_split): + if k <= max_split: for i in T.Parallel(dim_v): po_local[i] = Output_partial[bz, by, k, i] lse_local_split = glse[bz, by, k] scale_local = T.exp2(lse_local_split - lse_logsum_local) for i in T.Parallel(dim_v): o_accum_local[i] += po_local[i] * scale_local - for i in T.Parallel(dim_v): - Output[bz, by, i] = o_accum_local[i] - - return main + for i in T.Parallel(dim_v): + Output[bz, by, i] = o_accum_local[i] - return kernel_func + return main class SparseFlashAttn(torch.nn.Module): @@ -153,19 +159,7 @@ def __init__(self, batch, heads, heads_kv, dim, dim_v, block_size): self.dim = dim self.dim_v = dim_v self.block_size = block_size - self.block_H = 64 - - self.kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( - block_N=block_size, - block_H=self.block_H, - num_split=T.dynamic("num_split"), - num_stages=2, - threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks"), - ) - props = torch.cuda.get_device_properties(torch.device("cuda:0")) self.num_sm = props.multi_processor_count @@ -176,24 +170,33 @@ def forward(self, query, key, value, block_mask, cache_seqlens): dim_v = self.dim_v dim = self.dim block_size = self.block_size - block_H = self.block_H max_cache_seqlen = key.shape[1] # get num_split max_selected_blocks = (max_cache_seqlen + block_size - 1) // block_size - num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H + num_m_blocks = 1 * (heads // heads_kv + self.block_H - 1) // self.block_H num_n_blocks = max_selected_blocks size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 # kv_seqlen * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks - # num_sm = 132 num_sm = self.num_sm num_split = num_splits_heuristic( total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 ) - # print("num_split: ", num_split) + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") - Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - output = self.kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + + 
output = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=self.block_H, + num_stages=2, + threads=128, + )(query, key, value, block_mask, cache_seqlens, glse, output_partial) return output @@ -233,21 +236,21 @@ def sparse_gqa_decode_varlen_mask(query, key, value, block_mask, cache_seqlens, total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 ) - kernel = flashattn(batch, heads, heads_kv, dim, dim_v)( + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") + Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, block_N=block_size, block_H=block_H, - num_split=T.dynamic("num_split"), num_stages=2, threads=128, - max_cache_seqlen=T.dynamic("max_cache_seqlen"), - num_blocks=T.dynamic("num_blocks"), ) - glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") - Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - # print(kernel.get_kernel_source()) output = kernel(query, key, value, block_mask, cache_seqlens, glse, Output_partial) - return output @@ -297,12 +300,10 @@ def ref_program_fa(query, key, value, block_indices, cache_seqlens, max_cache_se return output -def debug(name, expect, actual, atol=1e-3, rtol=1e-3): +def assert_close(name, expect, actual, atol=1e-3, rtol=1e-3): all_close = torch.allclose(expect, actual, atol=atol, rtol=rtol) print(name + " all_close={}".format(all_close)) if not all_close: - # print(expect[3, 28]) - # print(actual[3, 28]) diff = (expect - actual).abs() print("all_close={}, max={}, min={}, mean={}".format(all_close, diff.max().item(), diff.min().item(), diff.mean().item())) max_indices = torch.nonzero(diff == diff.max().item()) @@ -353,7 +354,7 @@ def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=12 # out = sparse_gqa_decode_varlen_mask(Q, K, V, block_mask, cache_seqlens, block_size) model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) out = model(Q, K, V, block_mask, cache_seqlens) - debug("output", ref, out, atol=1e-3, rtol=1e-3) + assert_close("output", ref, out, atol=1e-3, rtol=1e-3) import flash_attn # noqa: F401 @@ -381,12 +382,13 @@ def main(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=12 def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32): + torch.manual_seed(42) + torch.cuda.manual_seed_all(42) batch, heads, heads_kv, max_cache_seqlen, dim, dim_v = batch, heads, heads_kv, max_cache_seqlen, dim, dim_v sparse_ratio = sparse_ratio block_size = block_size max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) dtype = torch.float16 - Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") @@ -408,31 +410,41 @@ def run_regression_perf(batch=8, heads=32, heads_kv=8, max_cache_seqlen=8192, di perm = torch.randperm(max_valid_block, device="cuda")[:valid_num_block] block_mask[b, h, perm] = True - model = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) - batch = model.batch - heads = model.heads - heads_kv = model.heads_kv - dim_v = model.dim_v - dim = model.dim - block_size = 
model.block_size - block_H = model.block_H - max_cache_seqlen = K.shape[1] + sparse_kernel = SparseFlashAttn(batch, heads, heads_kv, dim, dim_v, block_size) + batch = sparse_kernel.batch + heads = sparse_kernel.heads + heads_kv = sparse_kernel.heads_kv + dim_v = sparse_kernel.dim_v + dim = sparse_kernel.dim + block_size = sparse_kernel.block_size max_selected_blocks = (max_cache_seqlen + block_size - 1) // block_size - num_m_blocks = 1 * (heads // heads_kv + block_H - 1) // block_H - num_n_blocks = max_selected_blocks + num_m_blocks = 1 * (heads // heads_kv + sparse_kernel.block_H - 1) // sparse_kernel.block_H + num_n_blocks = max_selected_blocks size_one_kv_head = max_selected_blocks * block_size * (dim + dim_v) * 2 total_mblocks = batch * heads_kv * num_m_blocks - num_sm = model.num_sm + num_sm = sparse_kernel.num_sm + num_split = num_splits_heuristic( total_mblocks, num_sm, num_n_blocks, num_m_blocks, size_one_kv_head, is_causal_or_local=True, max_splits=128 ) + glse = torch.empty((batch, heads, num_split), dtype=torch.float32, device="cuda") - Output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") - kernel = model.kernel + output_partial = torch.empty((batch, heads, num_split, dim_v), dtype=torch.float32, device="cuda") + kernel = flashattn( + batch, + heads, + heads_kv, + dim, + dim_v, + block_N=block_size, + block_H=sparse_kernel.block_H, + num_stages=2, + threads=128, + ) def run_kernel_only(): - kernel(Q, K, V, block_mask, cache_seqlens, glse, Output_partial) + kernel(Q, K, V, block_mask, cache_seqlens, glse, output_partial) return do_bench(run_kernel_only, backend="cupti") diff --git a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py index 01695742b5..91d85a1a43 100644 --- a/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py +++ b/examples/blocksparse_attention/example_triton_sparse_gqa_decode_varlen_indice.py @@ -329,21 +329,15 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1 max_selected_blocks = int(math.ceil(max_cache_seqlen * (1 - sparse_ratio) / block_size)) print("max_selected_blocks: ", max_selected_blocks) dtype = torch.float16 - block_H = 64 Q = torch.randn((batch, heads, dim), dtype=dtype, device="cuda") K = torch.randn((batch, max_cache_seqlen, heads_kv, dim), dtype=dtype, device="cuda") V = torch.randn((batch, max_cache_seqlen, heads_kv, dim_v), dtype=dtype, device="cuda") cache_seqlens = torch.randint(1, max_cache_seqlen, (batch,), dtype=torch.int32, device="cuda") - # cache_seqlens = torch.full((batch,), max_cache_seqlen, dtype=torch.int32, device='cuda') # Ensure at least one element equals cache_seqlen random_index = torch.randint(0, batch, (1,), device="cuda").item() # Select a random index cache_seqlens[random_index] = max_cache_seqlen # Assign cache_seqlen to ensure at least one occurrence - - print("cache_seqlens: ", cache_seqlens) - max_valid_num_blocks = torch.ceil(cache_seqlens / block_size).int() - print("max_valid_num_blocks: ", max_valid_num_blocks) # Initialize block_indices with -1 (for padding blocks) block_indices = torch.full((batch, heads_kv, max_selected_blocks), -1, dtype=torch.int32, device="cuda") @@ -357,13 +351,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1 # Sort indices within each batch-group for consistency block_indices, _ = block_indices.sort(dim=-1, descending=True) - # 
print("block_indices: ", block_indices) - actual_num_blocks = torch.sum(block_indices != -1, dim=-1).to(torch.int32)[:, 0] - print("actual_num_blocks: ", actual_num_blocks) - # print(block_indices.shape, actual_num_blocks.shape) - max_num_blocks = torch.max(max_valid_num_blocks).item() - print("max_num_blocks: ", max_num_blocks) ref = ref_program_torch(Q, K, V, block_indices, cache_seqlens, max_cache_seqlen, max_num_blocks, block_size) @@ -402,6 +390,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1 avg_time = elapsed_time / 1000 avg_flops = total_flops / avg_time print(f"Average time: {avg_time:.6f} seconds") + print(f"Average FLOPS: {avg_flops:.2f} GFLOPS") # Measure performance of reference implementation import flash_attn # noqa: F401 @@ -415,7 +404,7 @@ def main(batch=64, heads=32, heads_kv=8, max_cache_seqlen=8192, dim=128, dim_v=1 avg_time_ref = elapsed_time_ref / 1000 avg_flops_ref = total_flops / avg_time_ref print(f"Average time of ref: {avg_time_ref:.6f} seconds") - + print(f"Average FLOPS of ref: {avg_flops_ref:.2f} GFLOPS") print(f"Speedup: {avg_time_ref / avg_time:.2f}x") diff --git a/examples/blocksparse_attention/test_example_blocksparse_attention.py b/examples/blocksparse_attention/test_example_blocksparse_attention.py index dd33f46c4e..0144f2004a 100644 --- a/examples/blocksparse_attention/test_example_blocksparse_attention.py +++ b/examples/blocksparse_attention/test_example_blocksparse_attention.py @@ -3,8 +3,6 @@ import example_tilelang_block_sparse_attn import example_tilelang_sparse_gqa_decode_varlen_indice import example_tilelang_sparse_gqa_decode_varlen_mask -import example_triton_sparse_gqa_decode_varlen_indice -import example_triton_sparse_gqa_decode_varlen_mask def test_block_sparse_attn_triton(): @@ -23,17 +21,5 @@ def test_example_tilelang_sparse_gqa_decode_varlen_mask(): example_tilelang_sparse_gqa_decode_varlen_mask.main(batch=1, max_cache_seqlen=2048) -def test_example_triton_sparse_gqa_decode_varlen_indice(): - example_triton_sparse_gqa_decode_varlen_indice.main( - batch=8, heads=8, heads_kv=4, max_cache_seqlen=2048, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 - ) - - -def test_example_triton_sparse_gqa_decode_varlen_mask(): - example_triton_sparse_gqa_decode_varlen_mask.main( - batch=16, heads=16, heads_kv=8, max_cache_seqlen=1024, dim=128, dim_v=128, sparse_ratio=0.8, block_size=32 - ) - - if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/blocksparse_gemm/example_blocksparse_gemm.py b/examples/blocksparse_gemm/example_blocksparse_gemm.py index 178cc59842..9defb72882 100644 --- a/examples/blocksparse_gemm/example_blocksparse_gemm.py +++ b/examples/blocksparse_gemm/example_blocksparse_gemm.py @@ -2,10 +2,8 @@ import itertools import tilelang import tilelang.language as T -from tilelang.engine.param import KernelParam from tilelang.utils.tensor import get_tensor_supply, TensorSupplyType import torch -from typing import List from tilelang.profiler import do_bench DEFAULT_BLOCK_M = 128 @@ -14,24 +12,8 @@ DEFAULT_NUM_STAGES = 2 DEFAULT_THREAD_NUM = 128 DEFAULT_ENABLE_RASTERIZATION = True - -parser = argparse.ArgumentParser(description="Autotuned BlockSparse MatMul Benchmark") -parser.add_argument("--m", type=int, default=1024, help="Matrix dimension M") -parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") -parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") -parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio 
(0-1)") -parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune") - -args, _ = parser.parse_known_args() -M, N, K = args.m, args.n, args.k -sparsity = args.sparsity -use_autotune = args.use_autotune default_tensor_supply = get_tensor_supply(TensorSupplyType.Auto) -print(f"Running BlockSparse MatMul Benchmark for M={M}, N={N}, K={K}") -print(f"Target Block Sparsity: {sparsity}") -print(f"Using Autotuner: {use_autotune}\n") - def get_configs(): block_M = [64, 128, 256] @@ -57,6 +39,8 @@ def get_configs(): def ref_program(A, B, BlockMask, block_M, block_N, block_K): + M, K = A.shape + _, N = B.shape ref_c = torch.zeros((M, N), dtype=torch.float16, device=A.device) for i in range(M // block_M): for j in range(N // block_N): @@ -70,25 +54,6 @@ def ref_program(A, B, BlockMask, block_M, block_N, block_K): return ref_c -def supply_program(params: List[KernelParam]): - input_tensors = [] - - for p in params: - # Check if the kernel parameter is BlockMask tensor. - # Here, BlockMask is uniquely identified by having 3 dimensions. - if len(p.shape) != 3: - # For non-BlockMask tensors, use the default tensor generation logic. - input_tensors.append(default_tensor_supply(p)) - else: - # For BlockMask tensor, randomly set elements to True based on desired - # sparsity level. - block_mask = torch.zeros(p.shape, dtype=torch.bool, device=torch.cuda.current_device()) - block_mask[:, :, :] = torch.rand(p.shape) > sparsity - input_tensors.append(block_mask) - - return input_tensors - - @tilelang.autotune( configs=get_configs(), ) @@ -127,6 +92,20 @@ def block_sparse_matmul( def main(): + parser = argparse.ArgumentParser(description="Autotuned BlockSparse MatMul Benchmark") + parser.add_argument("--m", type=int, default=1024, help="Matrix dimension M") + parser.add_argument("--n", type=int, default=1024, help="Matrix dimension N") + parser.add_argument("--k", type=int, default=1024, help="Matrix dimension K") + parser.add_argument("--sparsity", type=float, default=0.5, help="Sparsity ratio (0-1)") + parser.add_argument("--use_autotune", action="store_true", default=False, help="Whether to use autotune") + + args, _ = parser.parse_known_args() + M, N, K = args.m, args.n, args.k + sparsity = args.sparsity + use_autotune = args.use_autotune + print(f"Running BlockSparse MatMul Benchmark for M={M}, N={N}, K={K}") + print(f"Target Block Sparsity: {sparsity}") + print(f"Using Autotuner: {use_autotune}\n") # Initialize input matrices A and B on the GPU with half precision a = torch.randn(M, K).cuda().half() b = torch.randn(K, N).cuda().half() @@ -158,6 +137,7 @@ def main(): ) block_M, block_N, block_K = DEFAULT_BLOCK_M, DEFAULT_BLOCK_N, DEFAULT_BLOCK_K print(f"Using default kernel with block size ({block_M}, {block_N}, {block_K})") + # Create block mask with desired sparsity mask_shape = (M // block_M, N // block_N, K // block_K) block_mask = torch.rand(mask_shape).cuda() > sparsity @@ -177,6 +157,8 @@ def main(): def run_regression_perf(): + M = N = K = 1024 + sparsity = 0.5 torch.manual_seed(42) torch.cuda.manual_seed_all(42) a = torch.randn(M, K).cuda().half() diff --git a/examples/cast/example_per_token_cast_to_fp8.py b/examples/cast/example_per_token_cast_to_fp8.py index 4b3730b4b9..693e90d30a 100644 --- a/examples/cast/example_per_token_cast_to_fp8.py +++ b/examples/cast/example_per_token_cast_to_fp8.py @@ -92,21 +92,15 @@ def main(M=8192, N=8192, blk_m=8): print("Tile-lang: {:.2f} ms".format(latency)) from tilelang.profiler import do_bench + from 
example_triton_cast_to_fp8 import per_token_group_quant_fp8 - # Triton fp8e4nv is only supported on Hopper (SM90) and later - major, _ = torch.cuda.get_device_capability() - if major >= 9: - from example_triton_cast_to_fp8 import per_token_group_quant_fp8 + def run_triton(): + x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8(x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) + return x_fp8_triton_, x_amax_triton_ - def run_triton(): - x_fp8_triton_, x_amax_triton_ = per_token_group_quant_fp8(x, 128, 1e-4, dtype=torch.float8_e4m3fn, column_major_scales=False) - return x_fp8_triton_, x_amax_triton_ - - x_fp8_triton, x_amax_triton = run_triton() - latency = do_bench(run_triton) - print("Triton: {:.2f} ms".format(latency)) - else: - print("Triton fp8e4nv benchmark skipped (requires SM90+)") + x_fp8_triton, x_amax_triton = run_triton() + latency = do_bench(run_triton) + print("Triton: {:.2f} ms".format(latency)) def run_regression_perf(M=8192, N=8192, blk_m=8): diff --git a/examples/conftest.py b/examples/conftest.py index 4010e0d83a..afc122b6c2 100644 --- a/examples/conftest.py +++ b/examples/conftest.py @@ -21,6 +21,42 @@ np.random.seed(0) +# --------------------------------------------------------------------------- +# CuTeDSL backend: auto-mark known failures / unsupported tests +# --------------------------------------------------------------------------- + +# Known failures when running with TILELANG_TARGET=cutedsl. +# These are marked as xfail(strict=False) so unexpected passes are reported. +CUTEDSL_KNOWN_FAILURES = { + # Unimplemented sparse ops: tl.tl_gemm_sp + "sparse_tensorcore/test_example_sparse_tensorcore.py::test_tilelang_example_sparse_tensorcore", + "gemm_sp/test_example_gemm_sp.py::test_example_gemm_sp", + # Flaky — passes when run in isolation, fails under parallel execution + "minference/test_vs_sparse_attn.py::test_vs_sparse_attn", +} + + +def _match_any(nodeid, patterns): + """Return True if *nodeid* contains any of the *patterns*.""" + return any(p in nodeid for p in patterns) + + +def pytest_collection_modifyitems(config, items): # noqa: ARG001 + """When TILELANG_TARGET=cutedsl, annotate known-bad tests automatically.""" + if os.environ.get("TILELANG_TARGET") != "cutedsl": + return + + for item in items: + nid = item.nodeid + if _match_any(nid, CUTEDSL_KNOWN_FAILURES): + item.add_marker( + pytest.mark.xfail( + reason="CuTeDSL: known limitation (unimplemented op or flaky)", + strict=False, + ) + ) + + def pytest_terminal_summary(terminalreporter, exitstatus, config): """Ensure that at least one test is collected. 
Error out if all tests are skipped.""" known_types = { diff --git a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py index 18467a8118..22ce27de18 100644 --- a/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py +++ b/examples/deepseek_deepgemm/example_deepgemm_fp8_2xAcc.py @@ -4,7 +4,6 @@ import tilelang.testing import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type tilelang.testing.set_random_seed(42) @@ -28,7 +27,7 @@ def tl_gemm( ], "Currently only float16 and float32 are supported" group_size = 128 - block_M = 128 + block_M = 64 block_K = 128 A_shape = (M, K) @@ -51,7 +50,6 @@ def main( A_shared = T.alloc_shared(A_shared_shape, in_dtype) B_shared = T.alloc_shared(B_shared_shape, in_dtype) C_shared = T.alloc_shared(C_shared_shape, out_dtype) - Scale_C_shared = T.alloc_shared((block_M), T.float32) C_local = T.alloc_fragment(C_shared_shape, accum_dtype) C_local_accum = T.alloc_fragment(C_shared_shape, accum_dtype) @@ -66,15 +64,12 @@ def main( T.copy(A[by * block_M, k * block_K], A_shared) # Load B into shared memory T.copy(B[bx * block_N, k * block_K], B_shared) - # Load scale into shared memory Scale_B = scales_b[bx * block_N // group_size, k] - for i in T.Parallel(block_M): - Scale_C_shared[i] = scales_a[by * block_M + i, k] * Scale_B T.gemm(A_shared, B_shared, C_local, transpose_B=True) # Promote to enable 2xAcc for i, j in T.Parallel(block_M, block_N): - C_local_accum[i, j] += C_local[i, j] * Scale_C_shared[i] + C_local_accum[i, j] += C_local[i, j] * (scales_a[by * block_M + i, k] * Scale_B) T.clear(C_local) # TMA store T.copy(C_local_accum, C_shared) @@ -148,9 +143,9 @@ def assert_tl_gemm_correctness(M, N, K, block_N, in_dtype, out_dtype, accum_dtyp # src_code is the generated cuda source assert src_code is not None - in_dtype = map_torch_type(in_dtype) - out_dtype = map_torch_type(out_dtype) - accum_dtype = map_torch_type(accum_dtype) + in_dtype = in_dtype.as_torch() + out_dtype = out_dtype.as_torch() + accum_dtype = accum_dtype.as_torch() A = torch.randn(M, K).to(torch.bfloat16).cuda() B = torch.randn(N, K).to(torch.bfloat16).cuda() diff --git a/examples/deepseek_mhc/example_mhc_bwd.py b/examples/deepseek_mhc/example_mhc_bwd.py new file mode 100644 index 0000000000..2961d87952 --- /dev/null +++ b/examples/deepseek_mhc/example_mhc_bwd.py @@ -0,0 +1,283 @@ +# NOTE: This bwd script is not an official upstream script; it is community-written and provided for reference only. 
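+#
+# Informal sketch of the math this script implements (read off the code below,
+# not taken from an upstream reference): the forward pass Sinkhorn-normalizes
+# P = exp(M) into R. At the fixed point the row/column marginals are constant,
+# so implicit differentiation gives
+#     dL/dM = (dL/dR - x1_i - x2_j) * R,
+# where the corrections (x1, x2) solve the linear system
+#     x1 + R @ x2 = rowsum(R * dL/dR),
+#     x2 + R.T @ x1 = colsum(R * dL/dR).
+# The kernel solves this system tile-by-tile with conjugate gradient (matvec_A
+# and dot below are the CG building blocks), instead of backpropagating through
+# every Sinkhorn iteration.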
+# checkout pr: https://github.com/tile-ai/tilelang/pull/1758 +import torch + +import tilelang +import tilelang.language as T +from tilelang.autotuner import set_autotune_inputs +from tqdm import trange + + +dtype = torch.float32 + +seqlen = 65536 +n_stream = 16 +iters = 100 +repeat = 512 + +EPS = 1e-10 + + +def sinkhorn_forward(M, iters=20): + P = torch.exp(M) + R = P + + for _ in range(iters): + R = R / R.sum(-2, keepdim=True) + R = R / R.sum(-1, keepdim=True) + + return R, P + + +def sinkhorn_bwd_configs(n_stream, seqlen): + """Generate autotune configurations for different tilesize and threads""" + configs = [] + + # Explore different tile sizes and thread counts + tilesizes = [1, 2, 4, 8, 16, 32, 64] + thread_counts = [32, 64, 128, 256] + + for tilesize in tilesizes: + # Skip if tilesize doesn't divide seqlen evenly (optional constraint) + if seqlen % tilesize != 0: + continue + + for threads in thread_counts: + configs.append({"tilesize": tilesize, "threads": threads}) + + return configs + + +@tilelang.autotune( + configs=sinkhorn_bwd_configs(n_stream, seqlen), + warmup=4, + rep=repeat, +) +@tilelang.jit( + out_idx=[2], + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, +) +def sinkhorn_bwd_implicit_cg(n_stream: int, tilesize: int = 32, threads: int = 128): + seqlen = T.dynamic("seqlen") + tensor_shape = [seqlen, n_stream, n_stream] + dtype = T.float32 + + @T.macro + def matvec_A(R, x1, x2, buf, y1, y2): + for i_tile, i, j in T.Parallel(tilesize, n_stream, n_stream): + buf[i_tile, i, j] = R[i_tile, i, j] * x2[i_tile, j] + T.reduce_sum(buf, y1, dim=-1) + + for i_tile, i, j in T.Parallel(tilesize, n_stream, n_stream): + buf[i_tile, i, j] = R[i_tile, i, j] * x1[i_tile, i] + T.reduce_sum(buf, y2, dim=-2) + + for i_tile, i in T.Parallel(tilesize, n_stream): + y1[i_tile, i] += x1[i_tile, i] + y2[i_tile, i] += x2[i_tile, i] + + @T.macro + def dot(x1, x2, y1, y2, buf, out): + for i_tile, i in T.Parallel(tilesize, n_stream): + buf[i_tile, i] = x1[i_tile, i] * y1[i_tile, i] + x2[i_tile, i] * y2[i_tile, i] + + T.reduce_sum(buf, out, dim=-1) + + @T.prim_func + def main( + out: T.Tensor(tensor_shape, dtype), + dout: T.Tensor(tensor_shape, dtype), + res: T.Tensor(tensor_shape, dtype), + ): + with T.Kernel(T.ceildiv(seqlen, tilesize), threads=threads) as i_seq: + R = T.alloc_fragment([tilesize, n_stream, n_stream], dtype=dtype) + dR = T.alloc_fragment([tilesize, n_stream, n_stream], dtype=dtype) + RdR = T.alloc_fragment([tilesize, n_stream, n_stream], dtype=dtype) + res_tile = T.alloc_shared([tilesize, n_stream, n_stream], dtype=dtype) + b1 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + b2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + x1 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + x2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + r1 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + r2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + p1 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + p2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + alpha = T.alloc_fragment([tilesize, n_stream], dtype=dtype) + beta = T.alloc_fragment([tilesize, n_stream], dtype=dtype) + r_normsq = T.alloc_fragment([tilesize], dtype=dtype) + r_new_normsq = T.alloc_fragment([tilesize], dtype=dtype) + Ap1 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + Ap2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + pAp = T.alloc_fragment([tilesize], dtype=dtype) + + # Buffers for intermediate results + buf1 = T.alloc_shared([tilesize, n_stream, n_stream], 
dtype=dtype) + buf2 = T.alloc_shared([tilesize, n_stream], dtype=dtype) + + T.copy(out[i_seq * tilesize : (i_seq + 1) * tilesize, :, :], R) + T.copy(dout[i_seq * tilesize : (i_seq + 1) * tilesize, :, :], dR) + + for i_tile, i_nx, i_ny in T.Parallel(tilesize, n_stream, n_stream): + RdR[i_tile, i_nx, i_ny] = R[i_tile, i_nx, i_ny] * dR[i_tile, i_nx, i_ny] + + T.reduce_sum(RdR, b1, dim=-1) + T.reduce_sum(RdR, b2, dim=-2) + + T.fill(x1, 0.0) + T.fill(x2, 0.0) + + matvec_A(R, x1, x2, buf1, r1, r2) + + for i_tile, i_n in T.Parallel(tilesize, n_stream): + r1[i_tile, i_n] = b1[i_tile, i_n] - r1[i_tile, i_n] + + for i_tile, i_n in T.Parallel(tilesize, n_stream): + r2[i_tile, i_n] = b2[i_tile, i_n] - r2[i_tile, i_n] + + T.copy(r1, p1) + T.copy(r2, p2) + + dot(r1, r2, r1, r2, buf2, r_normsq) + + # Conjugate gradient: iteration starts + for _ in T.serial(2 * n_stream): + matvec_A(R, p1, p2, buf1, Ap1, Ap2) + + dot(p1, p2, Ap1, Ap2, buf2, pAp) + + for i_tile, i_n in T.Parallel(tilesize, n_stream): + # VERY important to avoid divide by zero + alpha[i_tile, i_n] = r_normsq[i_tile] / (pAp[i_tile] + EPS) + for i_tile, i_n in T.Parallel(tilesize, n_stream): + x1[i_tile, i_n] += alpha[i_tile, i_n] * p1[i_tile, i_n] + for i_tile, i_n in T.Parallel(tilesize, n_stream): + x2[i_tile, i_n] += alpha[i_tile, i_n] * p2[i_tile, i_n] + for i_tile, i_n in T.Parallel(tilesize, n_stream): + r1[i_tile, i_n] -= alpha[i_tile, i_n] * Ap1[i_tile, i_n] + for i_tile, i_n in T.Parallel(tilesize, n_stream): + r2[i_tile, i_n] -= alpha[i_tile, i_n] * Ap2[i_tile, i_n] + + dot(r1, r2, r1, r2, buf2, r_new_normsq) + + for i_tile, i_n in T.Parallel(tilesize, n_stream): + # not very important to avoid divide by zero, but it's good to have it + beta[i_tile, i_n] = r_new_normsq[i_tile] / (r_normsq[i_tile] + EPS) + for i_tile, i_n in T.Parallel(tilesize, n_stream): + p1[i_tile, i_n] = r1[i_tile, i_n] + beta[i_tile, i_n] * p1[i_tile, i_n] + for i_tile, i_n in T.Parallel(tilesize, n_stream): + p2[i_tile, i_n] = r2[i_tile, i_n] + beta[i_tile, i_n] * p2[i_tile, i_n] + + T.copy(r_new_normsq, r_normsq) + # Conjugate gradient: iteration ends + + for i_tile, i_nx, i_ny in T.Parallel(tilesize, n_stream, n_stream): + res_tile[i_tile, i_nx, i_ny] = (dR[i_tile, i_nx, i_ny] - x1[i_tile, i_nx] - x2[i_tile, i_ny]) * R[i_tile, i_nx, i_ny] + + T.copy(res_tile, res[i_seq * tilesize : (i_seq + 1) * tilesize, :, :]) + + return main + + +def main(): + print("Autotuning TileLang kernel for sinkhorn backward pass") + print(f"{seqlen = }") + print(f"{n_stream = }") + print(f"{iters = }") + print(f"{repeat = }") + + ###################################################################### + # Variable + ###################################################################### + dist = torch.distributions.uniform.Uniform(0.0, 4.0) + device = torch.device("cuda") + M = dist.sample((seqlen, n_stream, n_stream)).to(device) + M.requires_grad_() + + ###################################################################### + # Shared forward + one shared loss weight + ###################################################################### + R, P = sinkhorn_forward(M, iters) + loss_weight = torch.randn_like(R) + + ###################################################################### + # Method A: Autograd (reference) + ###################################################################### + loss_a = (R * loss_weight).sum() + loss_a.backward() + grad_M_autograd = M.grad.detach().clone() + + ###################################################################### + # Method B: Implicit 
differentiation with autotuning + ###################################################################### + grad_R = loss_weight + + print("\n" + "=" * 60) + print("Starting autotuning...") + print("=" * 60) + + # Set autotune inputs + with set_autotune_inputs(R, grad_R): + kernel = sinkhorn_bwd_implicit_cg(n_stream) + print(kernel.get_kernel_source()) + print("\n" + "=" * 60) + print("Autotuning completed! Running with best configuration...") + print("=" * 60) + + # Warmup and timing with best config + a = torch.randn(8192, 8192, device=device) + for _ in trange(4, desc="Warmup"): + _ = a @ a + grad_M_implicit = kernel(R, grad_R) + torch.cuda.synchronize() + + # Timing + start_event = torch.cuda.Event(enable_timing=True) + end_event = torch.cuda.Event(enable_timing=True) + + torch.cuda.synchronize() + start_event.record() + + for _ in range(repeat): + grad_M_implicit = kernel(R, grad_R) + + end_event.record() + torch.cuda.synchronize() + + elapsed_time_ms = start_event.elapsed_time(end_event) + + print(f"\nKernel execution time ({repeat = }): {elapsed_time_ms:.3f} ms") + print(f"Average time per iteration: {elapsed_time_ms / repeat:.3f} ms") + + ###################################################################### + # Compare + ###################################################################### + g1 = grad_M_autograd + g2 = grad_M_implicit + + abs_diff = (g1 - g2).abs() + # Use max of absolute values for more stable relative error + rel_diff = abs_diff / (torch.maximum(g1.abs(), g2.abs()) + 1e-8) + + print("\n" + "=" * 60) + print("Comparison of gradients dL/dM") + print("=" * 60) + + def format_list(ls): + return [f"{x:.2e}" for x in ls] + + MAE = abs_diff.mean(dim=(-1, -2)).tolist() + max_abs_diff = abs_diff.reshape(seqlen, -1).max(-1).values.tolist() + mean_rel_diff = rel_diff.mean(dim=(-1, -2)).tolist() + max_rel_diff = rel_diff.reshape(seqlen, -1).max(-1).values.tolist() + + print(f"Max MAE = {max(MAE):.6e}") + print(f"Max max_abs_diff = {max(max_abs_diff):.6e}") + print(f"Max mean_rel_diff = {max(mean_rel_diff):.6e}") + print(f"Max max_rel_diff = {max(max_rel_diff):.6e}") + + print("\nGrad (autograd) sample:\n", g1[0, :3, :3]) + print("\nGrad (implicit) sample:\n", g2[0, :3, :3]) + + +if __name__ == "__main__": + main() diff --git a/examples/deepseek_mhc/example_mhc_post.py b/examples/deepseek_mhc/example_mhc_post.py new file mode 100644 index 0000000000..9c9dc2f720 --- /dev/null +++ b/examples/deepseek_mhc/example_mhc_post.py @@ -0,0 +1,140 @@ +import math + +import torch + +import tilelang +import tilelang.language as T + + +@tilelang.jit( + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10}, +) +def mhc_post_tilelang(a, b, c, d, x, hc: int, hidden: int, n_thr: int = 128, h_blk: int = 1024) -> tilelang.JITKernel: + # rename for shorter code + n = T.dynamic("num_tokens") + h = hidden + + h_blk = math.gcd(hidden, h_blk) + a: T.Tensor((n, hc, hc), T.float32) + b: T.Tensor((n, hc, h), T.bfloat16) + c: T.Tensor((n, hc), T.float32) + d: T.Tensor((n, h), T.bfloat16) + x: T.Tensor((n, hc, h), T.bfloat16) + with T.Kernel(n, threads=n_thr) as i_n: + x_shared = T.alloc_shared((hc, h_blk), T.bfloat16) + b_shared = T.alloc_shared((hc, h_blk), T.bfloat16) + d_shared = T.alloc_shared(h_blk, T.bfloat16) + + x_local = T.alloc_fragment((hc, h_blk), T.float32) + b_local = T.alloc_fragment((hc, h_blk), T.float32) + d_local = T.alloc_fragment(h_blk, T.float32) + + a_local = T.alloc_fragment((hc, hc), T.float32) + c_local 
= T.alloc_fragment(hc, T.float32)
+        T.copy(a[i_n, 0, 0], a_local)
+        T.copy(c[i_n, 0], c_local)
+
+        for i0_h in T.Pipelined(T.ceildiv(h, h_blk), num_stages=2):
+            T.copy(b[i_n, 0, i0_h * h_blk], b_shared)
+            T.copy(d[i_n, i0_h * h_blk], d_shared)
+
+            T.copy(b_shared, b_local)
+            T.copy(d_shared, d_local)
+            for i_hco, i1_h in T.Parallel(hc, h_blk):
+                x_local[i_hco, i1_h] = c_local[i_hco] * d_local[i1_h]
+                for i_hci in T.serial(hc):
+                    x_local[i_hco, i1_h] += a_local[i_hci, i_hco] * b_local[i_hci, i1_h]
+            T.copy(x_local, x_shared)
+
+            T.copy(x_shared, x[i_n, 0, i0_h * h_blk])
+
+
+def mhc_post(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    post_layer_mix: torch.Tensor,
+    comb_res_mix: torch.Tensor,
+) -> torch.Tensor:
+    out = torch.empty_like(residual)
+    mhc_post_tilelang(comb_res_mix, residual, post_layer_mix.squeeze(-1), x, out, residual.shape[-2], residual.shape[-1])
+    return out
+
+
+def mhc_post_ref(
+    x: torch.Tensor,
+    residual: torch.Tensor,
+    post_layer_mix: torch.Tensor,
+    comb_res_mix: torch.Tensor,
+) -> torch.Tensor:
+    term2 = torch.bmm(comb_res_mix.mT, residual.float())
+    return (x.float().unsqueeze(-2) * post_layer_mix + term2).bfloat16()
+
+
+def generate_test_data(
+    n: int,
+    h: int,
+    hc_mult: int,
+    device: str = "cuda",
+) -> dict[str, torch.Tensor]:
+    """Generate test data for post operator."""
+    torch.random.manual_seed(42)
+
+    x = torch.randn((n, h), dtype=torch.bfloat16, device=device)
+    residual = torch.randn((n, hc_mult, h), dtype=torch.bfloat16, device=device)
+    post_layer_mix = torch.randn((n, hc_mult, 1), dtype=torch.float32, device=device)
+    comb_res_mix = torch.randn((n, hc_mult, hc_mult), dtype=torch.float32, device=device)
+
+    return {
+        "x": x,
+        "residual": residual,
+        "post_layer_mix": post_layer_mix,
+        "comb_res_mix": comb_res_mix,
+    }
+
+
+def test(n: int, h: int) -> None:
+    print(f"Testing mhc_post with {n=} {h=}")
+    test_data = generate_test_data(n=n, h=h, hc_mult=4)
+    out_tl = mhc_post(**test_data)
+    out_ref = mhc_post_ref(**test_data)
+    torch.testing.assert_close(out_tl, out_ref)
+
+
+def run_regression_perf(n: int = 4096, h: int = 2560, hc_mult: int = 4) -> float:
+    test_data = generate_test_data(n=n, h=h, hc_mult=hc_mult)
+    out = torch.empty_like(test_data["residual"])
+    post_layer_mix = test_data["post_layer_mix"].squeeze(-1)
+
+    def run_kernel_only():
+        mhc_post_tilelang(
+            test_data["comb_res_mix"],
+            test_data["residual"],
+            post_layer_mix,
+            test_data["x"],
+            out,
+            hc_mult,
+            h,
+        )
+
+    run_kernel_only()
+
+    from tilelang.profiler import do_bench
+
+    return do_bench(run_kernel_only, backend="cupti")
+
+
+def main():
+    for n in [4096]:
+        for h in [1280, 2560, 7168]:
+            test(n=n, h=h)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/deepseek_mhc/example_mhc_pre.py b/examples/deepseek_mhc/example_mhc_pre.py
new file mode 100644
index 0000000000..28b6c32bf6
--- /dev/null
+++ b/examples/deepseek_mhc/example_mhc_pre.py
@@ -0,0 +1,490 @@
+import math
+
+import tilelang
+import tilelang.language as T
+import torch
+
+
+@tilelang.jit(
+    pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_PTXAS_REGISTER_USAGE_LEVEL: 10},
+)
+def mhc_pre_big_fuse_tilelang(
+    gemm_out_mul,
+    gemm_out_sqrsum,
+    hc_scale,
+    hc_base,
+    residual,
+    post_mix,
+    comb_mix,
+    layer_input,
+    hidden_size: int,
+    rms_eps: 
float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, + n_splits: int = 16, + hc_mult: int = 4, +): + """Deeply fused kernels, everything other than gemm & sqrsum in mHC pre block.""" + num_tokens = T.dynamic("num_tokens") + hc_mult3 = hc_mult * (2 + hc_mult) + hidden_block = math.gcd(512, hidden_size) + + gemm_out_mul: T.Tensor[[n_splits, num_tokens, hc_mult3], T.float32] + gemm_out_sqrsum: T.Tensor[[n_splits, num_tokens], T.float32] + hc_scale: T.Tensor[[3], T.float32] + hc_base: T.Tensor[[hc_mult3], T.float32] + residual: T.Tensor[[num_tokens, hc_mult, hidden_size], T.bfloat16] + # outputs + post_mix: T.Tensor[[num_tokens, hc_mult], T.float32] + comb_mix: T.Tensor[[num_tokens, hc_mult * hc_mult], T.float32] + layer_input: T.Tensor[[num_tokens, hidden_size], T.bfloat16] + + with T.Kernel(num_tokens, threads=96) as i: + ################################################################## + # _pre_norm_fn_fwd_norm + rms = T.alloc_fragment(1, T.float32) + mixes = T.alloc_fragment(hc_mult3, T.float32) + T.clear(mixes) + rms[0] = 0 + for i_split in T.serial(n_splits): + rms[0] += gemm_out_sqrsum[i_split, i] + rms[0] = T.rsqrt(rms[0] / (hc_mult * hidden_size) + rms_eps) + for j in T.Parallel(hc_mult3): + mixes[j] = 0 + for i_split in T.serial(n_splits): + mixes[j] += gemm_out_mul[i_split, i, j] + mixes[j] *= rms[0] + mixes_shared = T.alloc_shared(hc_mult3, T.float32) + T.copy(mixes, mixes_shared) + + if T.get_thread_binding() < 32: + ################################################################## + # _pre_split_mixes_fwd (post & comb) + cm = T.alloc_fragment((hc_mult, hc_mult), T.float32) + for j in T.Parallel(hc_mult): + post_mix[i, j] = T.sigmoid(mixes_shared[j + hc_mult] * hc_scale[1] + hc_base[j + hc_mult]) * hc_post_mult_value + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = mixes_shared[j * hc_mult + k + hc_mult * 2] * hc_scale[2] + hc_base[j * hc_mult + k + hc_mult * 2] + + ################################################################## + # _sinkhorn_fwd + row_sum = T.alloc_fragment(hc_mult, T.float32) + col_sum = T.alloc_fragment(hc_mult, T.float32) + + # comb = comb.softmax(-1) + eps + row_max = T.alloc_fragment(hc_mult, T.float32) + T.reduce_max(cm, row_max, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = T.exp(cm[j, k] - row_max[j]) + T.reduce_sum(cm, row_sum, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / row_sum[j] + hc_sinkhorn_eps + + # comb = comb / (comb.sum(-2) + eps) + T.reduce_sum(cm, col_sum, dim=0) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (col_sum[k] + hc_sinkhorn_eps) + + for _ in T.serial(sinkhorn_repeat - 1): + # comb = comb / (comb.sum(-1) + eps) + T.reduce_sum(cm, row_sum, dim=1) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (row_sum[j] + hc_sinkhorn_eps) + + # comb = comb / (comb.sum(-2) + eps) + T.reduce_sum(cm, col_sum, dim=0) + for j, k in T.Parallel(hc_mult, hc_mult): + cm[j, k] = cm[j, k] / (col_sum[k] + hc_sinkhorn_eps) + + # save comb_mix to global memory + for j, k in T.Parallel(hc_mult, hc_mult): + comb_mix[i, j * hc_mult + k] = cm[j, k] + else: + ################################################################## + # _pre_split_mixes_fwd (pre) + pre_mix_shared = T.alloc_shared(hc_mult, T.float32) + for j in T.Parallel(hc_mult): + pre_mix_shared[j] = ( + T.sigmoid( + mixes_shared[j] * hc_scale[0] + hc_base[j], + ) + + hc_pre_eps + ) + ################################################################### + 
# _pre_apply_mix_fwd + for i0_h in T.Pipelined(hidden_size // hidden_block, num_stages=2): + xs = T.alloc_shared((hc_mult, hidden_block), T.float32) + xl = T.alloc_fragment((hc_mult, hidden_block), T.float32) + T.copy(residual[i, 0, i0_h * hidden_block], xs) + T.copy(xs, xl) + + ol = T.alloc_fragment(hidden_block, T.float32) + T.clear(ol) + + for i_hc in T.serial(hc_mult): + pre = pre_mix_shared[i_hc] + for i1_h in T.Parallel(hidden_block): + ol[i1_h] += pre * xl[i_hc, i1_h] + + T.copy(ol, layer_input[i, i0_h * hidden_block]) + + +@tilelang.jit +def mhc_pre_gemm_sqrsum_tilelang( + x, + fn, + out, + sqrsum, + hc_mult3: int, + hc_hidden_size: int, + token_block: int = 32, + hidden_block: int = 256, +) -> tilelang.JITKernel: + """Not highly optimized TileLang implementation of fused gemm and sqrsum in mHC pre block.""" + assert hc_mult3 <= 32 # should be 24 usually + num_tokens = T.dynamic("num_tokens") + assert hc_hidden_size % hidden_block == 0 + + x: T.Tensor((num_tokens, hc_hidden_size), T.bfloat16) + fn: T.Tensor((hc_mult3, hc_hidden_size), T.float32) + out: T.Tensor((num_tokens, hc_mult3), T.float32) + sqrsum: T.Tensor((num_tokens), T.float32) + + with T.Kernel(T.ceildiv(num_tokens, token_block)) as px: + out_frag = T.alloc_fragment((token_block, 32), T.float32) + sqrsum_part = T.alloc_fragment((token_block, 4), T.float32) + T.clear(out_frag) + T.clear(sqrsum_part) + for pz in T.Pipelined(hc_hidden_size // hidden_block, num_stages=2): + x_smem_16 = T.alloc_shared((token_block, hidden_block), T.bfloat16) + fn_smem = T.alloc_shared((32, hidden_block), T.float32) + + T.annotate_layout({x_smem_16: tilelang.layout.make_swizzled_layout(x_smem_16)}) + + T.copy(x[px * token_block, pz * hidden_block], x_smem_16) + T.copy(fn[0, pz * hidden_block], fn_smem) + + x_frag_16 = T.alloc_fragment((token_block, hidden_block), T.bfloat16) + T.copy(x_smem_16, x_frag_16) + x_frag = T.alloc_fragment((token_block, hidden_block), T.float32) + T.copy(x_frag_16, x_frag) + + for jj in T.serial(hidden_block // 4): + for i, j in T.Parallel(token_block, 4): + sqrsum_part[i, j] += x_frag[i, jj * 4 + j] * x_frag[i, jj * 4 + j] + + # should be TF32 gemm + T.gemm( + x_frag, + fn_smem, + out_frag, + transpose_A=False, + transpose_B=True, + clear_accum=False, + ) + sqrsum_l = T.alloc_fragment(token_block, T.float32) + T.reduce_sum(sqrsum_part, sqrsum_l) + for i in T.Parallel(token_block): + sqrsum[px * token_block + i] = sqrsum_l[i] + for i, j in T.Parallel(token_block, 32): + if j < hc_mult3: + out[px * token_block + i, j] = out_frag[i, j] + + +def mhc_pre( + residual: torch.Tensor, + fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_eps: float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, + n_splits: int = 1, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Forward pass for mHC pre block. 
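+
+    Pipeline (this summarizes the two kernels defined above in this file, not an
+    external API):
+    1. mhc_pre_gemm_sqrsum_tilelang computes mixes = residual @ fn.T together with
+       the squared sum of residual that feeds the RMS normalization factor.
+    2. mhc_pre_big_fuse_tilelang applies the RMS scaling, derives the sigmoid
+       pre/post mixes, Sinkhorn-normalizes the comb mix, and reduces the residual
+       streams with the pre mix to produce layer_input.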
+ + Args: + residual: shape (..., hc_mult, hidden_size), dtype torch.bfloat16 + fn: shape (hc_mult3, hc_mult * hidden_size), dtype torch.float32 + hc_scale: shape (3,), dtype torch.float32 + hc_base: shape (hc_mult3,), dtype torch.float32 + rms_eps: RMS normalization epsilon + hc_pre_eps: pre-mix epsilon + hc_sinkhorn_eps: sinkhorn epsilon + hc_post_mult_value: post-mix multiplier value + sinkhorn_repeat: number of sinkhorn iterations + n_splits: split-k factor; TileLang version of mhc_pre_gemm_sqrsum doesn't support this + + Returns: + post_mix: shape (..., hc_mult), dtype torch.float32 + comb_mix: shape (..., hc_mult, hc_mult), dtype torch.float32 + layer_input: shape (..., hidden_size), dtype torch.bfloat16 + """ + + # Validate shapes + assert residual.dtype == torch.bfloat16 + assert fn.dtype == torch.float32 + assert hc_scale.dtype == torch.float32 + assert hc_base.dtype == torch.float32 + + hc_mult = residual.shape[-2] + hidden_size = residual.shape[-1] + hc_mult2 = hc_mult * hc_mult + hc_mult3 = hc_mult * 2 + hc_mult2 + + hc_hidden_size = hc_mult * hidden_size + assert fn.shape[0] == hc_mult3 + assert fn.shape[1] == hc_hidden_size + assert hc_scale.shape == (3,) + assert hc_base.shape == (hc_mult3,) + + outer_shape = residual.shape[:-2] + + residual_flat = residual.view(-1, hc_mult, hidden_size) + num_tokens = residual_flat.shape[0] + fn_flat = fn + + post_mix = torch.empty(num_tokens, hc_mult, dtype=torch.float32, device=residual.device) + comb_mix = torch.empty(num_tokens, hc_mult2, dtype=torch.float32, device=residual.device) + layer_input = torch.empty(num_tokens, hidden_size, dtype=torch.bfloat16, device=residual.device) + + gemm_out_mul = torch.empty(n_splits, num_tokens, hc_mult3, dtype=torch.float32, device=residual.device) + gemm_out_sqrsum = torch.empty(n_splits, num_tokens, dtype=torch.float32, device=residual.device) + assert n_splits == 1, "The simple TileLang version gemm_sqrsum doesn't support split-k" + mhc_pre_gemm_sqrsum_tilelang( + residual_flat.view(num_tokens, hc_mult * hidden_size), + fn_flat, + gemm_out_mul.squeeze(0), + gemm_out_sqrsum.squeeze(0), + hc_mult3, + hc_mult * hidden_size, + ) + + mhc_pre_big_fuse_tilelang( + gemm_out_mul, + gemm_out_sqrsum, + hc_scale, + hc_base, + residual_flat, + post_mix, + comb_mix, + layer_input, + hidden_size, + rms_eps, + hc_pre_eps, + hc_sinkhorn_eps, + hc_post_mult_value, + sinkhorn_repeat, + n_splits, + hc_mult, + ) + + post_mix = post_mix.view(*outer_shape, hc_mult, 1) + comb_mix = comb_mix.view(*outer_shape, hc_mult, hc_mult) + layer_input = layer_input.view(*outer_shape, hidden_size) + + return post_mix, comb_mix, layer_input + + +def sinkhorn_normalize_ref(x: torch.Tensor, repeat: int, eps: float) -> torch.Tensor: + x = x.softmax(-1) + eps + x = x / (x.sum(-2, keepdim=True) + eps) + for _ in range(repeat - 1): + x = x / (x.sum(-1, keepdim=True) + eps) + x = x / (x.sum(-2, keepdim=True) + eps) + return x + + +def mhc_pre_ref( + residual: torch.Tensor, + fn: torch.Tensor, + hc_scale: torch.Tensor, + hc_base: torch.Tensor, + rms_eps: float, + hc_pre_eps: float, + hc_sinkhorn_eps: float, + hc_post_mult_value: float, + sinkhorn_repeat: int, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + hc_mult = residual.shape[-2] + + residual_flat = residual.flatten(-2, -1).float() + sqrsum = residual_flat.square().sum(-1) + mixes = residual_flat @ fn.T * (sqrsum.unsqueeze(-1) / fn.shape[-1] + rms_eps).rsqrt() + + hc_scale = torch.cat( + [ + hc_scale[0].expand(hc_mult), + hc_scale[1].expand(hc_mult), + 
hc_scale[2].expand(hc_mult * hc_mult), + ], + ) + mixes = mixes * hc_scale + hc_base + + pre_mix = mixes[:, :hc_mult].sigmoid().unsqueeze(-1) + hc_pre_eps + post_mix = (mixes[:, hc_mult : 2 * hc_mult].sigmoid() * hc_post_mult_value).unsqueeze(-1) + res_mix = mixes[:, 2 * hc_mult :].view(-1, hc_mult, hc_mult) + + res_mix = sinkhorn_normalize_ref(res_mix, repeat=sinkhorn_repeat, eps=hc_sinkhorn_eps) + + layer_input = (residual * pre_mix).sum(-2).bfloat16() + + return post_mix, res_mix, layer_input + + +def generate_test_data( + n: int, + hc_mult: int, + hidden_size: int, + rms_eps: float = 1e-6, + hc_pre_eps: float = 1e-6, + hc_sinkhorn_eps: float = 1e-6, + hc_post_mult_value: float = 1.0, + sinkhorn_repeat: int = 10, +) -> dict[str, torch.Tensor | float]: + """Generate test data for big fuse operator.""" + torch.random.manual_seed(42) + + hc_mult2 = hc_mult * hc_mult + hc_mult3 = hc_mult * 2 + hc_mult2 + device = "cuda" + + residual = ( + torch.randn((n, hc_mult, hidden_size), dtype=torch.float, device=device) + .mul(1 + torch.arange(hc_mult, device=device).mul(0.01).view(1, -1, 1)) + .bfloat16() + ) + + fn = ( + torch.randn((hc_mult3, hc_mult, hidden_size), dtype=torch.float, device=device) + * 1e-4 + * (1 + torch.arange(hc_mult, device=device).mul(0.01).view(1, -1, 1)) + ).flatten(1, 2) + + hc_scale = torch.randn((3,), dtype=torch.float, device=device) * 0.1 + + hc_base = torch.randn((hc_mult3,), dtype=torch.float, device=device) * 0.1 + + return { + "residual": residual, + "fn": fn, + "hc_scale": hc_scale, + "hc_base": hc_base, + "rms_eps": rms_eps, + "hc_pre_eps": hc_pre_eps, + "hc_sinkhorn_eps": hc_sinkhorn_eps, + "hc_post_mult_value": hc_post_mult_value, + "sinkhorn_repeat": sinkhorn_repeat, + } + + +def test(n: int, hidden_size: int, hc_mult: int) -> None: + print(f"Testing mhc_pre with {n=} {hidden_size=} {hc_mult=}") + test_data = generate_test_data( + n=n, + hc_mult=hc_mult, + hidden_size=hidden_size, + ) + + # Forward pass with big fuse + post_mix_fused, comb_mix_fused, layer_input_fused = mhc_pre(**test_data) + + # Forward pass with reference + post_mix_ref, comb_mix_ref, layer_input_ref = mhc_pre_ref(**test_data) + + # Compare outputs + torch.testing.assert_close(post_mix_fused, post_mix_ref) + torch.testing.assert_close(comb_mix_fused, comb_mix_ref) + torch.testing.assert_close(layer_input_fused, layer_input_ref) + + +def run_regression_perf( + n: int = 2048, + hidden_size: int = 4096, + hc_mult: int = 4, + rms_eps: float = 1e-6, + hc_pre_eps: float = 1e-6, + hc_sinkhorn_eps: float = 1e-6, + hc_post_mult_value: float = 1.0, + sinkhorn_repeat: int = 10, + n_splits: int = 1, +) -> float: + assert n_splits == 1, "The simple TileLang version gemm_sqrsum doesn't support split-k" + + test_data = generate_test_data( + n=n, + hc_mult=hc_mult, + hidden_size=hidden_size, + rms_eps=rms_eps, + hc_pre_eps=hc_pre_eps, + hc_sinkhorn_eps=hc_sinkhorn_eps, + hc_post_mult_value=hc_post_mult_value, + sinkhorn_repeat=sinkhorn_repeat, + ) + + residual = test_data["residual"] + fn = test_data["fn"] + hc_scale = test_data["hc_scale"] + hc_base = test_data["hc_base"] + + num_tokens = residual.shape[0] + hc_mult2 = hc_mult * hc_mult + hc_mult3 = hc_mult * 2 + hc_mult2 + + residual_flat = residual.view(num_tokens, hc_mult, hidden_size) + post_mix = torch.empty(num_tokens, hc_mult, dtype=torch.float32, device=residual.device) + comb_mix = torch.empty(num_tokens, hc_mult2, dtype=torch.float32, device=residual.device) + layer_input = torch.empty(num_tokens, hidden_size, dtype=torch.bfloat16, 
device=residual.device)
+    gemm_out_mul = torch.empty(n_splits, num_tokens, hc_mult3, dtype=torch.float32, device=residual.device)
+    gemm_out_sqrsum = torch.empty(n_splits, num_tokens, dtype=torch.float32, device=residual.device)
+
+    def run_kernel_only():
+        mhc_pre_gemm_sqrsum_tilelang(
+            residual_flat.view(num_tokens, hc_mult * hidden_size),
+            fn,
+            gemm_out_mul.squeeze(0),
+            gemm_out_sqrsum.squeeze(0),
+            hc_mult3,
+            hc_mult * hidden_size,
+        )
+
+        mhc_pre_big_fuse_tilelang(
+            gemm_out_mul,
+            gemm_out_sqrsum,
+            hc_scale,
+            hc_base,
+            residual_flat,
+            post_mix,
+            comb_mix,
+            layer_input,
+            hidden_size,
+            rms_eps,
+            hc_pre_eps,
+            hc_sinkhorn_eps,
+            hc_post_mult_value,
+            sinkhorn_repeat,
+            n_splits,
+            hc_mult,
+        )
+
+    run_kernel_only()
+
+    from tilelang.profiler import do_bench
+
+    return do_bench(run_kernel_only, backend="cupti")
+
+
+def main():
+    for n1 in [512, 1024, 2048, 8192]:
+        for hidden_size in [1280, 2560, 4096]:
+            for hc_mult in [4]:
+                test(n=n1, hidden_size=hidden_size, hc_mult=hc_mult)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/deepseek_mhc/regression_example_mhc.py b/examples/deepseek_mhc/regression_example_mhc.py
new file mode 100644
index 0000000000..880c3e04a6
--- /dev/null
+++ b/examples/deepseek_mhc/regression_example_mhc.py
@@ -0,0 +1,15 @@
+import tilelang.testing
+import example_mhc_post
+import example_mhc_pre
+
+
+def regression_example_mhc_post():
+    tilelang.testing.process_func(example_mhc_post.run_regression_perf)
+
+
+def regression_example_mhc_pre():
+    tilelang.testing.process_func(example_mhc_pre.run_regression_perf)
+
+
+if __name__ == "__main__":
+    tilelang.testing.regression()
diff --git a/examples/deepseek_mhc/test_example_mhc.py b/examples/deepseek_mhc/test_example_mhc.py
new file mode 100644
index 0000000000..3d9ecad4da
--- /dev/null
+++ b/examples/deepseek_mhc/test_example_mhc.py
@@ -0,0 +1,18 @@
+import tilelang.testing
+
+from example_mhc_post import main as main_post
+from example_mhc_pre import main as main_pre
+
+
+@tilelang.testing.requires_cuda
+def test_mhc_post():
+    main_post()
+
+
+@tilelang.testing.requires_cuda
+def test_mhc_pre():
+    main_pre()
+
+
+if __name__ == "__main__":
+    tilelang.testing.main()
diff --git a/examples/deepseek_mla/README.md b/examples/deepseek_mla/README.md
index bd3539d269..f75e606bd5 100644
--- a/examples/deepseek_mla/README.md
+++ b/examples/deepseek_mla/README.md
@@ -44,7 +44,7 @@ for i in range(loop_range):
     scores_scale = exp(scores_max_prev - scores_max)
     acc_o *= scores_scale
     acc_s = exp(acc_s - scores_max)
-    acc_o = acc_s @ V[i]
+    acc_o += acc_s @ V[i]
     ...
```

diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py
new file mode 100644
index 0000000000..9eae480822
--- /dev/null
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_aiter.py
@@ -0,0 +1,290 @@
+# This benchmark script is adapted from: https://github.com/deepseek-ai/FlashMLA/blob/main/benchmark/bench_flash_mla.py
+# ruff: noqa
+import argparse
+import math
+import random
+import torch
+
+import triton
+import triton.language as tl
+
+import tilelang
+from tilelang.profiler import do_bench
+
+try:
+    from aiter.mla import mla_decode_fwd
+except ImportError:
+    print("aiter is an AMD-specific kernel library. 
Please make sure aiter is installed on your AMD device.") + + +def scaled_dot_product_attention(query, key, value, h_q, h_kv, is_causal=False): + query = query.float() + key = key.float() + value = value.float() + key = key.repeat_interleave(h_q // h_kv, dim=0) + value = value.repeat_interleave(h_q // h_kv, dim=0) + attn_weight = query @ key.transpose(-2, -1) / math.sqrt(query.size(-1)) + if is_causal: + s_q = query.shape[-2] + s_k = key.shape[-2] + attn_bias = torch.zeros(s_q, s_k, dtype=query.dtype) + temp_mask = torch.ones(s_q, s_k, dtype=torch.bool).tril(diagonal=s_k - s_q) + attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf")) + attn_bias.to(query.dtype) + attn_weight += attn_bias + lse = attn_weight.logsumexp(dim=-1) + attn_weight = torch.softmax(attn_weight, dim=-1, dtype=torch.float32) + return attn_weight @ value, lse + + +@torch.inference_mode() +def run_torch_mla(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + blocked_v = blocked_k[..., :dv] + + def ref_mla(): + out = torch.empty(b, s_q, h_q, dv, dtype=torch.float32) + lse = torch.empty(b, h_q, s_q, dtype=torch.float32) + for i in range(b): + begin = i * max_seqlen_pad + end = begin + cache_seqlens[i] + O, LSE = scaled_dot_product_attention( + q[i].transpose(0, 1), + blocked_k.view(-1, h_kv, d)[begin:end].transpose(0, 1), + blocked_v.view(-1, h_kv, dv)[begin:end].transpose(0, 1), + h_q, + h_kv, + is_causal=causal, + ) + out[i] = O.transpose(0, 1) + lse[i] = LSE + return out, lse + + out_torch, lse_torch = ref_mla() + t = triton.testing.do_bench(ref_mla) + return out_torch, lse_torch, t + + +@torch.inference_mode() +def run_mla_aiter(q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + assert d > dv, "mla with rope dim should be larger than no rope dim" + + qo_indptr = torch.zeros(b + 1, dtype=torch.int) + kv_indptr = torch.zeros(b + 1, dtype=torch.int) + seq_lens_qo = torch.empty(b, dtype=torch.int) + seq_lens_qo.fill_(1) + max_seqlen_qo = seq_lens_qo.max().item() + + kv_indptr[1 : b + 1] = torch.cumsum(cache_seqlens, dim=0) + qo_indptr[1 : b + 1] = torch.cumsum(seq_lens_qo, dim=0) + total_q = qo_indptr[-1].item() + + # set block_size to 1 + page_size = 1 + kv_buffer = blocked_k.view(-1, page_size, h_kv, d) + + flat_indices = [] + for i in range(b): + start = i * max_seqlen_pad + end = start + cache_seqlens[i] + flat_indices.append(torch.arange(start, end, dtype=torch.int)) + + kv_indices = torch.cat(flat_indices) + + kv_last_page_lens = torch.ones(b, dtype=torch.int) + + sm_scale = 1.0 / (d**0.5) + + def mla_aiter(): + out_aiter = torch.empty((total_q, h_q, dv), dtype=dtype).fill_(-1) + attn_logits_aiter, attn_lse_aiter = mla_decode_fwd( + q.view((total_q, h_q, d)), + kv_buffer, + out_aiter, + qo_indptr, + kv_indptr, + kv_indices, + kv_last_page_lens, + max_seqlen_qo, + sm_scale, + ) + return out_aiter.view([b, s_q, h_q, dv]) + + out_aiter = mla_aiter() + t = triton.testing.do_bench(mla_aiter) + return out_aiter, None, t + + +FUNC_TABLE = { + "torch": run_torch_mla, + "mla_aiter": run_mla_aiter, +} + + +def compare_ab(baseline, target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): + print( + f"comparing {baseline} vs {target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}" + ) + device = torch.device("cuda:0") + torch.set_default_dtype(dtype) + torch.set_default_device(device) + torch.cuda.set_device(device) + 
torch.manual_seed(0)
+    random.seed(0)
+    assert baseline in FUNC_TABLE
+    assert target in FUNC_TABLE
+    baseline_func = FUNC_TABLE[baseline]
+    target_func = FUNC_TABLE[target]
+
+    total_seqlens = cache_seqlens.sum().item()
+    max_seqlen = cache_seqlens.max().item()
+    max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
+    # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}")
+
+    q = torch.randn(b, s_q, h_q, d)
+    block_size = 64
+    block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size)
+    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
+
+    out_a, lse_a, perf_a = baseline_func(
+        q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype
+    )
+    out_b, lse_b, perf_b = target_func(
+        q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype
+    )
+
+    torch.testing.assert_close(out_b.float(), out_a.float(), atol=1e-2, rtol=1e-2)
+    if target not in ["mla_aiter"]:
+        # mla_aiter doesn't return lse, so only compare it for other targets
+        torch.testing.assert_close(lse_b.float(), lse_a.float(), atol=1e-2, rtol=1e-2)
+
+    FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
+    bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8)
+    print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.3f} TFLOPS, {bytes / 10**6 / perf_a:.3f} GB/s")
+    print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s")
+    return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b
+
+
+def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype):
+    print(f"{target}: {b=}, {s_q=}, mean_seqlens={cache_seqlens.float().mean()}, {h_q=}, {h_kv=}, {d=}, {dv=}, {causal=}, {dtype=}")
+    torch.set_default_dtype(dtype)
+    device = torch.device("cuda:0")
+    torch.set_default_device(device)
+    torch.cuda.set_device(device)
+    torch.manual_seed(0)
+    random.seed(0)
+    assert target in FUNC_TABLE, f"target {target} not in {FUNC_TABLE}"
+    target_func = FUNC_TABLE[target]
+
+    total_seqlens = cache_seqlens.sum().item()
+    max_seqlen = cache_seqlens.max().item()
+    max_seqlen_pad = triton.cdiv(max_seqlen, 256) * 256
+    # print(f"{total_seqlens=}, {mean_seqlens=}, {max_seqlen=}")
+
+    q = torch.randn(b, s_q, h_q, d)
+    block_size = 64
+    block_table = torch.arange(b * max_seqlen_pad // block_size, dtype=torch.int32).view(b, max_seqlen_pad // block_size)
+    blocked_k = torch.randn(block_table.numel(), block_size, h_kv, d)
+
+    out_b, lse_b, perf_b = target_func(
+        q, block_table, blocked_k, max_seqlen_pad, block_size, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype
+    )
+
+    FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2
+    bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8)
+    print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s")
+    return bytes / 10**6 / perf_b
+
+
+available_targets = [
+    "torch",
+    "mla_aiter",
+]
+
+shape_configs = [
+    {
+        "b": batch,
+        "s_q": 1,
+        "cache_seqlens": torch.tensor([seqlen + 2 * i for i in range(batch)], dtype=torch.int32, device="cuda"),
+        "h_q": head,
+        "h_kv": 1,
+        "d": 512 + 64,
+        "dv": 512,
+        "causal": True,
+        "dtype": torch.bfloat16,
+    }
+    for batch in [64, 128]
+    for seqlen in [1024, 2048, 4096, 8192, 16384]
+    for head in [128]
+]
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--baseline", type=str, default="torch")
+    parser.add_argument("--target", type=str, default="mla_aiter")
+    parser.add_argument("--all", action="store_true")
+    parser.add_argument("--one", action="store_true")
+    parser.add_argument("--compare", action="store_true")
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    args = get_args()
+    benchmark_type = "all" if args.all else f"{args.baseline}_vs_{args.target}" if args.compare else args.target
+    with open(f"{benchmark_type}_perf.csv", "w") as fout:
+        fout.write("name,batch,seqlen,head,bw\n")
+        for shape in shape_configs:
+            if args.all:
+                for target in available_targets:
+                    perf = compare_a(
+                        target,
+                        shape["b"],
+                        shape["s_q"],
+                        shape["cache_seqlens"],
+                        shape["h_q"],
+                        shape["h_kv"],
+                        shape["d"],
+                        shape["dv"],
+                        shape["causal"],
+                        shape["dtype"],
+                    )
+                    fout.write(
+                        f"{target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n"
+                    )
+            elif args.compare:
+                perf_a, perf_b = compare_ab(
+                    args.baseline,
+                    args.target,
+                    shape["b"],
+                    shape["s_q"],
+                    shape["cache_seqlens"],
+                    shape["h_q"],
+                    shape["h_kv"],
+                    shape["d"],
+                    shape["dv"],
+                    shape["causal"],
+                    shape["dtype"],
+                )
+                fout.write(
+                    f"{args.baseline},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf_a:.0f}\n"
+                )
+                fout.write(
+                    f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf_b:.0f}\n"
+                )
+            elif args.one:
+                perf = compare_a(
+                    args.target,
+                    shape["b"],
+                    shape["s_q"],
+                    shape["cache_seqlens"],
+                    shape["h_q"],
+                    shape["h_kv"],
+                    shape["d"],
+                    shape["dv"],
+                    shape["causal"],
+                    shape["dtype"],
+                )
+                fout.write(
+                    f"{args.target},{shape['b']},{shape['cache_seqlens'].float().mean().cpu().item():.0f},{shape['h_q']},{perf:.0f}\n"
+                )
diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
index dccf333ad3..399bb8e6e5 100644
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_tilelang.py
@@ -131,7 +131,7 @@ def main_split(
                 lse_local_split = glse[bz, by, k]
                 scale_local = T.exp2(lse_local_split - lse_logsum_local)
                 for i in T.Parallel(dim):
-                    o_accum_local[i] += po_local[i] * scale_local[0]
+                    o_accum_local[i] += po_local[i] * scale_local
             for i in T.Parallel(dim):
                 Output[bz, by, i] = o_accum_local[i]

@@ -259,6 +259,8 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial):
     num_split = 4
     threads = 128

+    print(f"Using {batch=}, {heads=}, {kv_heads=}, {kv_ctx=}, {dim=}, {pe_dim=}")
+
     if enable_autotune:
         kernel = flashmla_decode(batch, heads, kv_heads, kv_ctx, dim, pe_dim)
     else:
@@ -267,8 +269,6 @@ def ref_program(q, q_pe, kv, k_pe, glse, Output_partial):
     input_tensors = profiler._get_inputs()
     tilelang_output = kernel(*input_tensors)
     ref_output = ref_program(*input_tensors)
-    print(f"Tilelang output: {tilelang_output}")
-    print(f"Ref output: {ref_output}")
     torch.testing.assert_close(tilelang_output, ref_output, rtol=0.01, atol=0.01)
     latency = profiler.do_bench(warmup=500)
     print(f"Latency: {latency} ms")
diff --git a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py
index 861e841c4e..e8c1006a01 100644
--- a/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py
+++ b/examples/deepseek_mla/amd/benchmark_mla_decode_amd_triton.py
@@ -378,8 +378,8 @@ def compare_ab(baseline, 
target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) - print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.0f} TFLOPS, {bytes / 10**6 / perf_a:.0f} GB/s") - print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") + print(f"perf {baseline}: {perf_a:.3f} ms, {FLOPS / 10**9 / perf_a:.3f} TFLOPS, {bytes / 10**6 / perf_a:.3f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") return bytes / 10**6 / perf_a, bytes / 10**6 / perf_b @@ -410,7 +410,7 @@ def compare_a(target, b, s_q, cache_seqlens, h_q, h_kv, d, dv, causal, dtype): FLOPS = s_q * total_seqlens * h_q * (d + dv) * 2 bytes = (total_seqlens * h_kv * d + b * s_q * h_q * d + b * s_q * h_q * dv) * (torch.finfo(dtype).bits // 8) - print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.0f} TFLOPS, {bytes / 10**6 / perf_b:.0f} GB/s") + print(f"perf {target}: {perf_b:.3f} ms, {FLOPS / 10**9 / perf_b:.3f} TFLOPS, {bytes / 10**6 / perf_b:.3f} GB/s") return bytes / 10**6 / perf_b diff --git a/examples/deepseek_mla/example_mla_decode.py b/examples/deepseek_mla/example_mla_decode.py index 7de4faf089..4daa39f494 100644 --- a/examples/deepseek_mla/example_mla_decode.py +++ b/examples/deepseek_mla/example_mla_decode.py @@ -6,14 +6,10 @@ from einops import rearrange, einsum import argparse -tilelang.disable_cache() - @tilelang.jit( - out_idx=[6], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, + out_idx=[4], + pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}, ) def flashattn(batch, heads, kv_head_num, seqlen_kv, dim, pe_dim, block_N, block_H, num_split, softmax_scale): scale = float(softmax_scale * 1.44269504) # log2(e) @@ -29,10 +25,10 @@ def main_split( Q_pe: T.Tensor([batch, heads, pe_dim], dtype), KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), Output: T.Tensor([batch, heads, dim], dtype), ): + glse = T.alloc_global([batch, heads, num_split], dtype) + Output_partial = T.alloc_global([batch, heads, num_split, dim], dtype) # flash_attn_split with T.Kernel(batch, heads // min(block_H, kv_group_num), num_split, threads=256) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -127,8 +123,6 @@ def main_no_split( Q_pe: T.Tensor([batch, heads, pe_dim], dtype), KV: T.Tensor([batch, seqlen_kv, kv_head_num, dim], dtype), K_pe: T.Tensor([batch, seqlen_kv, kv_head_num, pe_dim], dtype), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor([batch, heads, num_split, dim], dtype), Output: T.Tensor([batch, heads, dim], dtype), ): with T.Kernel(heads // min(block_H, kv_group_num), batch, threads=256) as (hid, bid): @@ -187,15 +181,13 @@ def main_no_split( return main_no_split -def ref_program(q, q_pe, kv, k_pe, glse, Output_partial): +def ref_program(q, q_pe, kv, k_pe): # """ # Inputs: # - q (Tensor): [batch, heads, dim] # - q_pe (Tensor): [batch, heads, pe_dim] # - kv (Tensor): [batch, seqlen_kv, kv_head_num, dim] # - k_pe (Tensor): [batch, seqlen_kv, kv_head_num, pe_dim] - # - glse (Tensor): [batch, heads, num_split] - # - Output_partial (Tensor): [batch, heads, num_split, dim] # 
Outputs: # - output (Tensor): [batch, heads, dim] # """ diff --git a/examples/deepseek_mla/example_mla_decode_ws.py b/examples/deepseek_mla/example_mla_decode_ws.py index 32eb0d4754..98657e381a 100644 --- a/examples/deepseek_mla/example_mla_decode_ws.py +++ b/examples/deepseek_mla/example_mla_decode_ws.py @@ -101,9 +101,9 @@ def main_split( T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) T.clear(acc_s) - T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -136,9 +136,9 @@ def main_split( T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) T.clear(acc_s) - T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -215,40 +215,44 @@ def main_split( T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 - with T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ - bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v - ] + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(K_pe[bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): kv_indices = (seqlen_kv // num_split) * bz + (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 - with T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - 
K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ - bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v - ] + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(K_pe[bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_1_ready[0]) # combine @@ -346,9 +350,9 @@ def main_no_split( T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) T.clear(acc_s) - T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -381,9 +385,9 @@ def main_no_split( T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) T.clear(acc_s) - T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -459,40 +463,44 @@ def main_no_split( T.barrier_wait(bar_k_0_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): kv_indices = (i_i * 2) * block_N + r * 16 + (tx - 256) // 8 - with T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ - bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v - ] + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(K_pe[bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 T.barrier_wait(bar_k_1_free[0], ((i_i & 1) ^ 1)) for r in T.serial(4): kv_indices = (i_i * 2 + 1) * block_N + r * 16 + (tx - 256) // 8 - with 
T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = K_pe[ - bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8 + v - ] + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[bid, kv_indices, cur_kv_head, dim // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(K_pe[bid, kv_indices, cur_kv_head, (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_1_ready[0]) if num_split > 1: diff --git a/examples/deepseek_mla/test_example_mla_decode.py b/examples/deepseek_mla/test_example_mla_decode.py index a269ea57ae..00e30023a4 100644 --- a/examples/deepseek_mla/test_example_mla_decode.py +++ b/examples/deepseek_mla/test_example_mla_decode.py @@ -1,9 +1,14 @@ +import os +import pytest import tilelang.testing import example_mla_decode +_is_cutedsl = os.environ.get("TILELANG_TARGET", "").lower() == "cutedsl" + @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@pytest.mark.skipif(_is_cutedsl, reason="CuTeDSL backend does not support alloc_global yet") def test_example_mla_decode(): example_mla_decode.main() diff --git a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py index ca98d01be9..697f3de38c 100644 --- a/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py +++ b/examples/deepseek_nsa/benchmark/benchmark_nsa_fwd.py @@ -460,13 +460,7 @@ def get_configs(): @tilelang.autotune( configs=get_configs(), ) -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_sparse_attention( batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16, block_T=128, num_stages=2, threads=32 ): diff --git a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py index 3da285a9ba..2aa30a5bc5 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_bwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_bwd.py @@ -18,13 +18,7 @@ import tilelang -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_kernel_fwd( batch, heads, diff --git 
a/examples/deepseek_nsa/example_tilelang_nsa_decode.py b/examples/deepseek_nsa/example_tilelang_nsa_decode.py index 381d92493e..79414a762b 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_decode.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_decode.py @@ -12,11 +12,7 @@ # auto warp specialization may have some bugs. @tilelang.jit( out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}, ) def native_sparse_attention( batch, diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py index 7b36d6e26f..abed2e41dd 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd.py @@ -10,11 +10,7 @@ @tilelang.jit( out_idx=[-1], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def native_sparse_attention(batch, heads, seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: diff --git a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py index b52ebe42e2..1d5d942b40 100644 --- a/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py +++ b/examples/deepseek_nsa/example_tilelang_nsa_fwd_varlen.py @@ -17,13 +17,7 @@ from einops import rearrange -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - } -) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def native_sparse_attention_varlen(batch, heads, c_seq_len, dim, is_causal, scale=None, block_size=64, groups=1, selected_blocks=16): if scale is None: scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) diff --git a/examples/deepseek_v32/fp8_lighting_indexer.py b/examples/deepseek_v32/fp8_lighting_indexer.py index 03e88dd972..2f88575971 100644 --- a/examples/deepseek_v32/fp8_lighting_indexer.py +++ b/examples/deepseek_v32/fp8_lighting_indexer.py @@ -283,28 +283,16 @@ def run_regression_perf(S=4096, SKV=8192, H=32, HKV=1, D=64, kv_stride=1): q = torch.randn(S, H, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) kv = torch.randn(SKV, D, device="cuda", dtype=torch.bfloat16).to(torch.bfloat16) weights = torch.randn(S, H, device="cuda", dtype=torch.float32) - p = (torch.randn(S, SKV, device="cuda", dtype=torch.float32) * 4).softmax(dim=-1) - ks, ke = generate_random_cu_seqlens(per_cp_seqlen=S, cp_size=4, cp_rank=3, kv_stride=kv_stride, average_q_len=2048) - logits_ref, cost_ref = ref_fp8_mqa_logits(q=q, kv=kv, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - q_fp8 = q.to(torch.float8_e4m3fn) kv_fp8, kv_scales = per_custom_dims_cast_to_fp8(kv, (0,), False) - logits_tl = mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - diff = validate_tensor_match(logits_ref, logits_tl, tolerance=1e-14, 
tensor_name="logits", should_raise=False) - from tilelang.profiler import do_bench def logits_fn(): return mqa_attn_return_logits_interface(q=q_fp8, kv=kv_fp8, kv_scales=kv_scales, weights=weights, cu_seqlen_ks=ks, cu_seqlen_ke=ke) - with torch.profiler.profile(activities=[torch.profiler.ProfilerActivity.CUDA]) as prof: - logits_fn() - - print(prof.key_averages().table(sort_by="cuda_time_total", max_name_column_width=50)) - return do_bench(logits_fn, backend="cupti") diff --git a/examples/deepseek_v32/inference/README.md b/examples/deepseek_v32/inference/README.md index 60afe7ceb1..f5cd62491c 100644 --- a/examples/deepseek_v32/inference/README.md +++ b/examples/deepseek_v32/inference/README.md @@ -1,6 +1,6 @@ # DeepSeek V3.2 -First convert huggingface model weights to the the format required by our inference demo. Set `MP` to match your available GPU count: +First convert huggingface model weights to the format required by our inference demo. Set `MP` to match your available GPU count: ```bash cd inference export EXPERTS=256 diff --git a/examples/deepseek_v32/inference/convert.py b/examples/deepseek_v32/inference/convert.py index 090be71455..cb912e1d8b 100644 --- a/examples/deepseek_v32/inference/convert.py +++ b/examples/deepseek_v32/inference/convert.py @@ -29,8 +29,7 @@ "wq_b": ("wq_b", None), "wk": ("wk", None), "k_norm": ("k_norm", None), - "weights_proj": ("weights_proj", None), -} + "weights_proj": ("weights_proj", None)} def main(hf_ckpt_path, save_path, n_experts, mp): diff --git a/examples/deepseek_v32/inference/kernel.py b/examples/deepseek_v32/inference/kernel.py index 25abf15d59..9d9402d1a8 100644 --- a/examples/deepseek_v32/inference/kernel.py +++ b/examples/deepseek_v32/inference/kernel.py @@ -7,9 +7,7 @@ pass_configs = { tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True, -} + tilelang.PassConfigKey.TL_DISABLE_FAST_MATH: True} FP8 = T.float8_e4m3fn BF16 = T.bfloat16 @@ -17,15 +15,15 @@ def fast_log2_ceil(x): - bits_x = T.reinterpret(T.uint32, x) + bits_x = T.reinterpret(x, T.uint32) exp_x = (bits_x >> 23) & 0xFF man_bits = bits_x & ((1 << 23) - 1) - return T.Cast(T.int32, exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0)) + return T.cast(exp_x - 127 + T.if_then_else(man_bits != 0, 1, 0), T.int32) def fast_pow2(x): bits_x = (x + 127) << 23 - return T.reinterpret(T.float32, bits_x) + return T.reinterpret(bits_x, T.float32) def fast_round_scale(amax, fp8_max_inv): diff --git a/examples/deepseek_v32/sparse_mla_bwd.py b/examples/deepseek_v32/sparse_mla_bwd.py index 527de22b39..50192fa2bf 100644 --- a/examples/deepseek_v32/sparse_mla_bwd.py +++ b/examples/deepseek_v32/sparse_mla_bwd.py @@ -76,7 +76,6 @@ def postprocess_kernel( @tilelang.jit( out_idx=[-2], pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_AGGRESSIVE_SHARED_MEMORY_MERGE: True, }, diff --git a/examples/deepseek_v32/sparse_mla_fwd.py b/examples/deepseek_v32/sparse_mla_fwd.py index 2c8bf7fc74..5426d9072b 100644 --- a/examples/deepseek_v32/sparse_mla_fwd.py +++ b/examples/deepseek_v32/sparse_mla_fwd.py @@ -7,10 +7,7 @@ @tilelang.jit( out_idx=[-2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def sparse_mla_fwd( heads, @@ -161,9 +158,7 
@@ def main( for h_i in T.Parallel(H_per_block): sumexp[h_i] = T.log2(sumexp[h_i]) + m_i[h_i] * sm_scale - T.copy(acc_o, O_shared) T.copy(acc_o, Output[b_i, s_i, H0:H1, :]) - T.copy(sumexp, Lse_shared) T.copy(sumexp, Lse[b_i, s_i, H0:H1]) return main diff --git a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py index 7e664d11b4..bff9f19b98 100644 --- a/examples/deepseek_v32/sparse_mla_fwd_pipelined.py +++ b/examples/deepseek_v32/sparse_mla_fwd_pipelined.py @@ -133,9 +133,9 @@ def main( tx = T.get_thread_binding() - T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) - T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) - T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + T.tma_copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l, barrier=bar_q) + T.tma_copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r, barrier=bar_q) + T.tma_copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared, barrier=bar_q) T.barrier_arrive(bar_q) if tx < 128: @@ -151,9 +151,9 @@ def main( for h_i, bi_i in T.Parallel(H_per_block, BI): acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_0_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_0_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_0, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -187,9 +187,9 @@ def main( for h_i, bi_i in T.Parallel(H_per_block, BI): acc_s[h_i, bi_i] = T.if_then_else(is_kv_valid[bi_i], 0, -T.infinity(acc_s.dtype)) - T.gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True, wg_wait=-1) - T.gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True, wg_wait=-1) + T.wgmma_gemm(Q_shared_l, KV_shared_1_l, acc_s, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_1_r, acc_s, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_1, acc_s, transpose_B=True) T.wait_wgmma(0) @@ -266,20 +266,23 @@ def main( indices_local = Indices[b_i, s_i, g_i, (i_i * 2) * BI + r * 16 + (tx - 256) // 8] is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: - with T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v - ] + # Manually issue cp.async copies for KV_left, KV_right, and K_tail. 
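+                            # Thread-to-element mapping for the cooperative load: the 128 producer
+                            # threads (tx in [256, 384)) work 8-per-row, so row = r * 16 + (tx - 256) // 8
+                            # and each thread owns a contiguous run of 8 elements starting at column
+                            # 64 * u + (tx - 256) % 8 * 8; over u = 0..3 the 8 threads cover all 256
+                            # columns of each KV half (D // 2).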
+ for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, D + (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_0_ready[0]) # Buffer 1 @@ -288,20 +291,23 @@ def main( indices_local = Indices[b_i, s_i, g_i, (i_i * 2 + 1) * BI + r * 16 + (tx - 256) // 8] is_kv_valid[r * 16 + (tx - 256) // 8] = indices_local <= max_kv_i if is_kv_valid[r * 16 + (tx - 256) // 8]: - with T.attr("default", "async_scope", 1): - for u in T.serial(4): - for v in T.vectorized(8): - KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8 + v - ] - KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8 + v - ] - with T.attr("default", "async_scope", 1): - for v in T.vectorized(8): - K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8 + v] = KV[ - b_i, indices_local, g_i, D + (tx - 256) % 8 * 8 + v - ] + # Manually issue cp.async copies for KV_left, KV_right, and K_tail. + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, indices_local, g_i, D + (tx - 256) % 8 * 8], "r", 8), + 8, + ) T.cp_async_barrier_noinc(bar_k_1_ready[0]) return main @@ -410,8 +416,6 @@ def fn(): tl_out, tl_lse = fn() ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) - # print(f"tl_out: {tl_out}") - # print(f"ref_out: {ref_out}") torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) @@ -450,12 +454,12 @@ def run_regression_perf(B=1, S=4096, SKV=8192, H=128, HKV=1, DQK=576, DV=512, to CP0 = q_start_s_index == 0 kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, KV_stride, kv_group, None, True, CP0) - def run_kernel_only(): + def fn(): kernel(q, kv, indices, torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda")) from tilelang.profiler import do_bench - return do_bench(run_kernel_only, backend="cupti") + return do_bench(fn, backend="cupti") if __name__ == "__main__": diff --git a/examples/deepseek_v32/sparse_mla_fwd_seesaw.py b/examples/deepseek_v32/sparse_mla_fwd_seesaw.py new file mode 100644 index 0000000000..cdffac281a --- /dev/null +++ b/examples/deepseek_v32/sparse_mla_fwd_seesaw.py @@ -0,0 +1,643 @@ +# ruff: noqa +import torch +import tilelang +from tilelang import language as T +import argparse + + +@tilelang.jit( + out_idx=[-2, -1], + compile_flags=[ + "-O3", + "--ptxas-options=-v,--register-usage-level=10", + "-DNDEBUG", + 
"-Wno-deprecated-declarations", + "-U__CUDA_NO_HALF_OPERATORS__", + "-U__CUDA_NO_HALF_CONVERSIONS__", + "-U__CUDA_NO_HALF2_OPERATORS__", + "-U__CUDA_NO_BFLOAT16_CONVERSIONS__", + "--expt-relaxed-constexpr", + "--expt-extended-lambda", + ], +) +def sparse_mla_fwd( + batch, + seq_len, + seq_len_kv, + heads, + dim, + tail_dim, + topk, + kv_stride, + kv_group=1, + sm_scale=None, + is_causal=True, + CP0=True, + block_I=64, + num_stages=0, + threads=384, +): + assert dim == tilelang.math.next_power_of_2(dim), f"haven't check padding correctness yet, dim={dim}" + assert tail_dim == tilelang.math.next_power_of_2(tail_dim), f"haven't check padding correctness yet, dim={tail_dim}" + assert is_causal == True, "non-casual is not supported" + assert topk % block_I == 0, "otherwise will load some index=0 thus causing wrong kv to be loaded" + if sm_scale is None: + sm_scale = (1.0 / (dim + tail_dim)) ** 0.5 * 1.44269504 # log2(e) + else: + sm_scale = sm_scale * 1.44269504 # log2(e) + + head_kv = heads // kv_group + q_shape = [batch, seq_len, heads, dim + tail_dim] + kv_shape = [batch, seq_len_kv, kv_group, dim + tail_dim] + o_shape = [batch, seq_len, heads, dim] + indices_shape = [batch, seq_len, kv_group, topk] + lse_shape = [batch, seq_len, heads] + indices_dtype = "int32" + dtype = "bfloat16" + accum_dtype = "float" + + G = kv_group + H = head_kv + padded_H = max(tilelang.math.next_power_of_2(head_kv), 16) + if padded_H != H: + assert kv_group == 1, ( + "here we solve the H padding automatically, other wise you " + "should handle Q copy and Output copy with your mask (when " + "kv_group == 1, use g_i * padded_H:(g_i+1) * padded_H would " + "be handled automatically)" + ) + BI = block_I + NI = tilelang.cdiv(topk, block_I) + assert NI % 2 == 0, "NI should be a multiple of 2" + D = dim + D_tail = tail_dim + KV_stride = kv_stride + if head_kv > 64: + assert head_kv % 64 == 0, "head_kv should be a multiple of 64" + REPLICATE_H = head_kv // 64 + else: + REPLICATE_H = 1 + + # Increasing from 32->64 reduces the time spent reading kvcache. If num_query_head = 128 + # and num_kv_head = 1, the same kvcache originally needed to be read 4 times, but now only 2 times + H_per_block = padded_H if REPLICATE_H == 1 else 64 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), # type: ignore + KV: T.Tensor(kv_shape, dtype), # type: ignore + Indices: T.Tensor(indices_shape, indices_dtype), # type: ignore + q_start_index_s: T.Tensor(1, indices_dtype), # type: ignore + Output: T.Tensor(o_shape, dtype), # type: ignore + Lse: T.Tensor(lse_shape, accum_dtype), # type: ignore + ): + with T.Kernel( + # If CP0 is True (i.e., start of sequence), skip the first (KV_stride - 1) + # queries that cannot see any KV. 
Also be careful that seq_len < kv_stride could cause negative grid size + (max(0, seq_len - kv_stride + 1) if CP0 else seq_len) * REPLICATE_H, + batch, + kv_group, + threads=threads, + ) as (bx, by, bz): + Q_shared_l = T.alloc_shared([H_per_block, D // 2], dtype) + Q_shared_r = T.alloc_shared([H_per_block, D // 2], dtype) + Q_tail_shared = T.alloc_shared([H_per_block, D_tail], dtype) + + KV_shared_0_l = T.alloc_shared([BI, D // 2], dtype) + KV_shared_0_r = T.alloc_shared([BI, D // 2], dtype) + KV_shared_1_l = T.alloc_shared([BI, D // 2], dtype) + KV_shared_1_r = T.alloc_shared([BI, D // 2], dtype) + K_tail_shared_0 = T.alloc_shared([BI, D_tail], dtype) + K_tail_shared_1 = T.alloc_shared([BI, D_tail], dtype) + + O_shared_l = Q_shared_l + O_shared_r = Q_shared_r + + # Whether the kv in current BI is visible for this query + # Producer alternates writing to buf0 and buf1 masks. To avoid the situation + # where consumer0 is still reading buf0 mask when producer has already started + # writing buf1 mask, we use two buf masks + is_kv_valid = T.alloc_shared([2, BI], "bool", scope="shared") + + acc_o_l = T.alloc_fragment([H_per_block, D // 2], accum_dtype) + acc_o_r = T.alloc_fragment([H_per_block, D // 2], accum_dtype) + + # WG0 computes S0(BI_2*i), WG1 computes S1(BI_2*i+1), shared via shared memory + + # Reuse K_tail_shared for S_shared to save memory when dimensions match + # Must reuse, otherwise H100 SM's shared mem is insufficient (> 228kb), this is shared mem bound + S_shared_0 = K_tail_shared_0 + S_shared_1 = K_tail_shared_1 + + # WG0 and WG1 exchange local max with each other, compare to compute global max, and rescale their O_L or O_R accordingly + row_max_shared_0 = T.alloc_shared([H_per_block], accum_dtype) + row_max_shared_1 = T.alloc_shared([H_per_block], accum_dtype) + + # Used to store sum of exps for even BI and odd BI respectively, which will be summed up for integration later + row_sum_shared_0 = T.alloc_shared([H_per_block], accum_dtype) + row_sum_shared_1 = T.alloc_shared([H_per_block], accum_dtype) + + # acc_s, sumexp, m_i each need to be allocated separately for consumer0 and consumer1 + acc_s_0 = T.alloc_fragment([H_per_block, BI], accum_dtype) + acc_s_1 = T.alloc_fragment([H_per_block, BI], accum_dtype) + + sumexp_0 = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev_0 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_peer_0 = T.alloc_fragment([H_per_block], accum_dtype) + + sumexp_1 = T.alloc_fragment([H_per_block], accum_dtype) + sumexp_i_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_prev_1 = T.alloc_fragment([H_per_block], accum_dtype) + m_i_peer_1 = T.alloc_fragment([H_per_block], accum_dtype) + + bar_q = T.alloc_barrier(arrive_count=384) + + # Producer -> Consumer Barriers + bar_k_0_ready = T.alloc_barrier(arrive_count=128) # Prod arrives + bar_k_1_ready = T.alloc_barrier(arrive_count=128) # Prod arrives + + # Consumer -> Producer Barriers (Both consumers must arrive) + bar_k_0_free = T.alloc_barrier(arrive_count=256) + bar_k_1_free = T.alloc_barrier(arrive_count=256) + + # Inter-Consumer Barriers (Seesaw Sync) + bar_stats_0_ready = T.alloc_barrier(arrive_count=128) # Cons 0 arrives + bar_stats_1_ready = T.alloc_barrier(arrive_count=128) # Cons 1 arrives + + bar_S_0_ready = T.alloc_barrier(arrive_count=128) # Cons 0 arrives + bar_S_1_ready = T.alloc_barrier(arrive_count=128) # Cons 
1 arrives + + b_i, g_i = by, bz + # If it's the first chunk, start computing directly from the (kv_stride - 1)-th token + s_i = (bx + (KV_stride - 1 if CP0 else 0)) if REPLICATE_H == 1 else (bx // REPLICATE_H + (KV_stride - 1 if CP0 else 0)) + q_i = q_start_index_s[0] + s_i + # Sometimes to reduce kvcache size, we may not store KV for every token, but store + # KV every KV_stride tokens (usually the last token in the stride window), + # so the kv range visible to the current query should be [0:max_kv_i] + max_kv_i = (q_i + 1 - KV_stride) // KV_stride + + H0 = g_i * padded_H + (0 if REPLICATE_H == 1 else (bx % REPLICATE_H) * 64) + H1 = H0 + H_per_block + + tx = T.get_thread_binding() + + T.copy(Q[b_i, s_i, H0:H1, 0 : D // 2], Q_shared_l) + T.copy(Q[b_i, s_i, H0:H1, D // 2 : D], Q_shared_r) + T.copy(Q[b_i, s_i, H0:H1, D:], Q_tail_shared) + + # Non-blockingly increment the barrier's internal counter, producer threads can start loading kv ahead of time + T.barrier_arrive(bar_q) + + if tx >= 256: + # producer: prefetch kvcache to shared mem + T.set_max_nreg(72, 0) + + prefetch_indices_0 = T.alloc_fragment([4], indices_dtype) + prefetch_indices_1 = T.alloc_fragment([4], indices_dtype) + + # Prime the Pump! Prefetch indices for iter_0 + for r in T.serial(4): + # This read will cause a long scoreboard stall, but it only happens once before the loop starts + prefetch_indices_0[r] = Indices[b_i, s_i, g_i, r * 16 + (tx - 256) // 8] + prefetch_indices_1[r] = Indices[b_i, s_i, g_i, BI + r * 16 + (tx - 256) // 8] + + for i_i in T.serial(T.ceildiv(NI, 2)): + # Buffer 0 + # Wait for both KV_shared_0_l and KV_shared_0_r to be done being used + + T.barrier_wait(bar_k_0_free[0], (i_i & 1)) + + # Block size `BI` is 64, loading is divided into 4 iterations, each processing 16 indices + # Producer has 128 threads total, 8 consecutive threads collaborate to load kv for one index + for r in T.serial(4): + # mitigate long scoreboard stall here + index = prefetch_indices_0[r] + is_kv_valid[0, r * 16 + (tx - 256) // 8] = index <= max_kv_i + if is_kv_valid[0, r * 16 + (tx - 256) // 8]: + # 8 threads collaborate to load one row of KV_dim (512) in 4 iters, each loading 8 elems + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_0_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, index, g_i, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_0_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, index, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + # tail_dim (64) needs only one iter of 8 elems per 8 collaborating threads + T.ptx_cp_async( + T.access_ptr(K_tail_shared_0[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, index, g_i, D + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.cp_async_barrier_noinc(bar_k_0_ready[0]) + + if i_i + 1 < T.ceildiv(NI, 2): + # Async prefetch indices needed for the next round of kv data loading, overlaps with current round to hide latency + for r in T.serial(4): + prefetch_indices_0[r] = Indices[b_i, s_i, g_i, ((i_i + 1) * 2) * BI + r * 16 + (tx - 256) // 8] + + # Buffer 1 + T.barrier_wait(bar_k_1_free[0], (i_i & 1)) + + for r in T.serial(4): + index = prefetch_indices_1[r] + is_kv_valid[1, r * 16 + (tx - 256) // 8] = index <= max_kv_i + if is_kv_valid[1, r * 16 + (tx - 256) // 8]: + for u in T.serial(4): + T.ptx_cp_async( + T.access_ptr(KV_shared_1_l[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + 
T.access_ptr(KV[b_i, index, g_i, 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(KV_shared_1_r[r * 16 + (tx - 256) // 8, 64 * u + (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, index, g_i, D // 2 + 64 * u + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.ptx_cp_async( + T.access_ptr(K_tail_shared_1[r * 16 + (tx - 256) // 8, (tx - 256) % 8 * 8], "w", 8), + T.access_ptr(KV[b_i, index, g_i, D + (tx - 256) % 8 * 8], "r", 8), + 8, + ) + T.cp_async_barrier_noinc(bar_k_1_ready[0]) + + if i_i + 1 < T.ceildiv(NI, 2): + for r in T.serial(4): + prefetch_indices_1[r] = Indices[b_i, s_i, g_i, ((i_i + 1) * 2 + 1) * BI + r * 16 + (tx - 256) // 8] + + elif tx < 128: + # Check if 384 threads have already arrived at bar_q (phase0 completed), + # if not continue waiting, otherwise pass through directly + T.barrier_wait(bar_q, 0) + + # pre-arrive free barriers to indicate buffers are initially free + # At the beginning of phase0, tells producer it can load data into both buffers + T.barrier_arrive(bar_k_0_free[0]) + T.barrier_arrive(bar_k_1_free[0]) + + # Consumer 0 (WG0): Responsible for Even Blocks and O_L (Left Half) + T.set_max_nreg(216, 1) + T.fill(sumexp_0, 0) + for h_i in T.Parallel(H_per_block): + m_i_0[h_i] = -5e4 + T.fill(acc_o_l, 0) + + # Each iteration, two consumers cooperate to compute two BIs + for i_i in T.serial(T.ceildiv(NI, 2)): + # --- Step 1: Compute S0 = Q @ K0^T (Even Block) --- + T.barrier_wait(bar_k_0_ready[0], (i_i & 1)) + + T.fill(acc_s_0, 0) + T.wgmma_gemm(Q_shared_l, KV_shared_0_l, acc_s_0, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_0_r, acc_s_0, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_0, acc_s_0, transpose_B=True) + + T.copy(m_i_0, m_i_prev_0) + T.wait_wgmma(0) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + if not is_kv_valid[0, bi_i]: + acc_s_0[h_i, bi_i] = -5e4 + T.reduce_max(acc_s_0, m_i_0, dim=1, clear=False) + + # --- Step 2: Local Softmax Stats & Exchange --- + T.copy(m_i_0, row_max_shared_0) + T.barrier_arrive(bar_stats_0_ready) + # If consumer0 has received the local max from consumer1 at iter_i, this also means + # consumer1 has finished using S_0 passed by consumer0 at iter_i-1, + # so we can write to it directly without blocking below + T.barrier_wait(bar_stats_1_ready, (i_i & 1)) + T.copy(row_max_shared_1, m_i_peer_0) + + # Update global max and scale O + for h_i in T.Parallel(H_per_block): + m_i_0[h_i] = T.max(m_i_0[h_i], m_i_peer_0[h_i]) + + # Scale O_L + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_l[h_i, d_i] *= T.exp2((m_i_prev_0[h_i] - m_i_0[h_i]) * sm_scale) + + # Scale SumExp + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] *= T.exp2((m_i_prev_0[h_i] - m_i_0[h_i]) * sm_scale) + + # Compute P0 = exp(S0 - m_new) + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s_0[h_i, bi_i] = T.exp2(acc_s_0[h_i, bi_i] * sm_scale - m_i_0[h_i] * sm_scale) + + # Update SumExp with P0 + T.reduce_sum(acc_s_0, sumexp_i_0, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] += sumexp_i_0[h_i] + + # --- Step 3: O_L += P0 @ V0_L (Self-Attention) --- + # Wait for S0 buffer to be free (consumed by peer in prev iter) + # T.barrier_wait(bar_S_0_free, (i_i & 1)) + T.copy(acc_s_0, S_shared_0) + T.barrier_arrive(bar_S_0_ready) + + T.wgmma_gemm(S_shared_0, KV_shared_0_l, acc_o_l, transpose_B=False) + + # --- Step 4: O_L += P1 @ V1_L (Cross-Attention) --- + # Wait for P1 (S1) from peer + T.barrier_wait(bar_S_1_ready, (i_i & 1)) + + T.wgmma_gemm(S_shared_1, KV_shared_1_l, acc_o_l, 
transpose_B=False) + + # NOTE: However, k_0 and k_1 are used by both consumer0 and consumer1, so this doesn't bring much performance improvement + # Except for the most recent async gemm (i.e., S_shared_1 @ KV_shared_1_k), all others need to wait to finish + T.wait_wgmma(1) + T.barrier_arrive(bar_k_0_free[0]) + # Wait for all async gemms to finish + T.wait_wgmma(0) + T.barrier_arrive(bar_k_1_free[0]) + + T.copy(sumexp_0, row_sum_shared_0) + T.barrier_arrive(bar_stats_0_ready) # Reuse barrier + T.barrier_wait(bar_stats_1_ready, T.ceildiv(NI, 2) & 1) + T.copy(row_sum_shared_1, sumexp_i_0) # Reuse sumexp_i buffer + + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] += sumexp_i_0[h_i] + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_l[h_i, d_i] /= sumexp_0[h_i] + + for h_i in T.Parallel(H_per_block): + sumexp_0[h_i] = T.log2(sumexp_0[h_i]) + m_i_0[h_i] * sm_scale + + T.copy(acc_o_l, O_shared_l) + T.copy(O_shared_l, Output[b_i, s_i, H0:H1, 0 : D // 2]) + T.copy(sumexp_0, Lse[b_i, s_i, H0:H1]) # Write LSE + + elif tx >= 128 and tx < 256: + T.barrier_wait(bar_q, 0) + + # pre-arrive free barriers to indicate buffers are initially free + # At the beginning of phase0, tells producer it can load data into both buffers + T.barrier_arrive(bar_k_0_free[0]) + T.barrier_arrive(bar_k_1_free[0]) + + # Consumer 1 (WG1): Responsible for Odd Blocks and O_R (Right Half) + # NOTE: 256 * 216 + 128 * 72 = 64,512 < 65536 (H100 SM RegFile Limit), + # setting more registers will cause a hang, all values must be multiples of 8 + T.set_max_nreg(216, 1) + T.fill(sumexp_1, 0) + for h_i in T.Parallel(H_per_block): + m_i_1[h_i] = -5e4 + T.fill(acc_o_r, 0) + + for i_i in T.serial(T.ceildiv(NI, 2)): + # --- Step 1: Compute S1 = Q @ K1^T (Odd Block) --- + T.barrier_wait(bar_k_1_ready[0], (i_i & 1)) + + T.fill(acc_s_1, 0) + T.wgmma_gemm(Q_shared_l, KV_shared_1_l, acc_s_1, transpose_B=True) + T.wgmma_gemm(Q_shared_r, KV_shared_1_r, acc_s_1, transpose_B=True) + T.wgmma_gemm(Q_tail_shared, K_tail_shared_1, acc_s_1, transpose_B=True) + + # --- Step 2: Local Softmax Stats & Exchange --- + T.copy(m_i_1, m_i_prev_1) + T.wait_wgmma(0) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + if not is_kv_valid[1, bi_i]: + acc_s_1[h_i, bi_i] = -5e4 + + T.reduce_max(acc_s_1, m_i_1, dim=1, clear=False) + T.copy(m_i_1, row_max_shared_1) + T.barrier_arrive(bar_stats_1_ready) + T.barrier_wait(bar_stats_0_ready, (i_i & 1)) + T.copy(row_max_shared_0, m_i_peer_1) + + for h_i in T.Parallel(H_per_block): + m_i_1[h_i] = T.max(m_i_1[h_i], m_i_peer_1[h_i]) + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_r[h_i, d_i] *= T.exp2((m_i_prev_1[h_i] - m_i_1[h_i]) * sm_scale) + + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] *= T.exp2((m_i_prev_1[h_i] - m_i_1[h_i]) * sm_scale) + + for h_i, bi_i in T.Parallel(H_per_block, BI): + acc_s_1[h_i, bi_i] = T.exp2(acc_s_1[h_i, bi_i] * sm_scale - m_i_1[h_i] * sm_scale) + + T.reduce_sum(acc_s_1, sumexp_i_1, dim=1) + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] += sumexp_i_1[h_i] + + # --- Step 3: O_R += P1 @ V1_R (Self-Attention) --- + T.copy(acc_s_1, S_shared_1) + + T.barrier_arrive(bar_S_1_ready) + + T.wgmma_gemm(S_shared_1, KV_shared_1_r, acc_o_r, transpose_B=False) + + # --- Step 4: O_R += P0 @ V0_R (Cross-Attention) --- + T.barrier_wait(bar_S_0_ready, (i_i & 1)) + + T.wgmma_gemm(S_shared_0, KV_shared_0_r, acc_o_r, transpose_B=False) + + T.wait_wgmma(1) + T.barrier_arrive(bar_k_1_free[0]) + T.wait_wgmma(0) + T.barrier_arrive(bar_k_0_free[0]) + + T.copy(sumexp_1, row_sum_shared_1) 
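+                    # Final seesaw exchange: each consumer accumulated exp-sums only for its
+                    # half of the index blocks, so the two workgroups now swap row sums through
+                    # shared memory and add them, leaving both sides with the full softmax
+                    # denominator.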
+ T.barrier_arrive(bar_stats_1_ready) + T.barrier_wait(bar_stats_0_ready, T.ceildiv(NI, 2) & 1) + T.copy(row_sum_shared_0, sumexp_i_1) + + for h_i in T.Parallel(H_per_block): + sumexp_1[h_i] += sumexp_i_1[h_i] + + for h_i, d_i in T.Parallel(H_per_block, D // 2): + acc_o_r[h_i, d_i] /= sumexp_1[h_i] + + T.copy(acc_o_r, O_shared_r) + T.copy(O_shared_r, Output[b_i, s_i, H0:H1, D // 2 : D]) + + return main + + +def sparse_mla_fwd_interface( + q, kv, indices, q_start_index_s, kv_stride, sm_scale=None, is_casual=True, return_kernel=False, print_kernel=False +): + assert q.is_contiguous() and kv.is_contiguous() and indices.is_contiguous() + batch, seq_len, heads, dim_plus_tail_dim = q.shape + _, seq_len_kv, kv_group, _ = kv.shape + + assert dim_plus_tail_dim == 576, "you should assign dim otherwise" + dim = 512 + + assert kv.shape[-1] == dim_plus_tail_dim + tail_dim = dim_plus_tail_dim - dim + assert kv.shape[0] == batch + _, _, _, topk = indices.shape + assert indices.shape == (batch, seq_len, kv_group, topk) + + if q_start_index_s != 0: + assert q_start_index_s > kv_stride, ( + "If it is because each cp has too short length, you should fix the logic involving CP0 (cp_rank == 0), to make sure q with pos < KV_Stride - 1 is masked (or you may just ignore how this is handled if nan in these q's Out would not effect others, which is reported to be likely to happen by wangding)" + ) + CP0 = q_start_index_s == 0 + + # Compile the kernel + kernel = sparse_mla_fwd(batch, seq_len, seq_len_kv, heads, dim, tail_dim, topk, kv_stride, kv_group, sm_scale, is_casual, CP0) + + if print_kernel: + print(kernel.get_kernel_source()) + + if return_kernel: + return kernel + + ( + out, + lse, + ) = kernel(q, kv, indices, torch.tensor([q_start_index_s], dtype=torch.int32, device="cuda")) + if q_start_index_s == 0 and kv_stride > 1: + # Set the output of the first (kv_stride - 1) positions to 0, since they cannot see any kv so no computation was performed + out[:, : kv_stride - 1, :, :] = 0 + return out, lse + + +def ref_sparse_mla_fwd_interface(q, kv, indices, q_start_index_s, kv_stride=1, sm_scale=None, is_casual=True): + q = q.float() + kv = kv.float() + indices = indices.transpose(1, 2) + b, sq, h, dim_q = q.shape + b, sk, g, _ = kv.shape + if q_start_index_s is None: + q_start_index_s = sk * kv_stride - sq + + assert kv.shape[-1] == 576, "you should assign dim otherwise" + dim = 512 + k = kv + v = kv[..., :dim] + + b, _, _, dim_v = v.shape + num_kv_per_index = 1 + g_index = g + h_index = h // g + compressed_casual_mask = torch.arange(q_start_index_s, sq + q_start_index_s, dtype=torch.int32, device="cuda").view( + -1, 1 + ) >= torch.arange(kv_stride - 1, sk * kv_stride, kv_stride, dtype=torch.int32, device="cuda").view(1, -1) + + mask = q.new_zeros(b, g_index, sq, sk + 1, dtype=torch.bool).scatter(3, indices.long(), 1) + mask = mask[..., :-1] + mask = mask & compressed_casual_mask.view(1, 1, sq, sk) + mask[:, :, : kv_stride - 1, 0] = True + mask = mask.view(b, g_index, 1, sq, sk) + + q = q.view(b, sq, g, -1, dim_q) + score = torch.einsum("bmghd,bngd->bghmn", q, k) + sm_scale = dim_q**-0.5 if sm_scale is None else sm_scale + score = score.masked_fill(~mask, float("-inf")).mul(sm_scale) + p = score.softmax(dim=-1) + p = p.view(b, g_index, h_index, -1, sq, sk) + p = p.view(b, g, -1, sq, sk) + o = torch.einsum("bghmn,bngd->bmghd", p.type(v.dtype), v) + o = o.reshape(b, sq, h, dim_v) + return o.to(torch.bfloat16) + + +def test_sparse_mla_fwd_pipelined( + B=1, + S=4096, + SKV=8192, + H=128, + HKV=1, + DQK=576, + DV=512, + 
topk=2048, + dtype=torch.bfloat16, + # Offset of query in global sequence position (or relative to kv) + q_start_s_index=2048, + check_correctness=True, + profile=False, +): + KV_stride = 1 + + torch.random.manual_seed(0) + q = torch.randn((B, S, H, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + kv = torch.randn((B, SKV, HKV, DQK), dtype=dtype, device="cuda").requires_grad_(True) / 10 + q_start_s_index_t = torch.tensor([q_start_s_index], dtype=torch.int32, device="cuda") + + q.clamp_(-10, 10) + kv.clamp_(-10, 10) + + indices = torch.full((B, S, HKV, topk), SKV, dtype=torch.int32, device="cuda") + for b in range(B): + for t in range(S): + for h in range(HKV): + # Add offset q_start_s_index to convert to global sequence position + i_i = torch.randperm(min(max(1, ((t + q_start_s_index) // KV_stride)), SKV))[:topk] + indices[b, t, h, : len(i_i)] = i_i + + print("index generation finished") + + kernel = sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride, return_kernel=True, print_kernel=True) + + def fn(): + return kernel(q, kv, indices, q_start_s_index_t) + + if check_correctness: + tl_out, tl_lse = fn() + assert KV_stride == 1, "KV_stride > 1 not supported" + # if q_start_s_index == 0 and KV_stride > 1: + # tl_out[:, :KV_stride - 1, :, :] = 0 + ref_out = ref_sparse_mla_fwd_interface(q, kv, indices, q_start_s_index, KV_stride) + print(f"tl_out: {tl_out}") + print(f"ref_out: {ref_out}") + torch.testing.assert_close(tl_out, ref_out, rtol=1e-3, atol=1e-3) + + if profile: + print("Profiling mode: running minimal iterations (1 warmup + 1 run)...") + fn() + torch.cuda.synchronize() + fn() + torch.cuda.synchronize() + return + + from tilelang.profiler import do_bench + + ms = do_bench( + fn, + rep=20, + warmup=10, + ) + print(f"Average time: {ms:.3f} ms") + print(f"fwd io bandwidth = ", (B * S * DQK * topk * 2) / (ms * 1e-3) / 1e12) + tflops = (B * S * (DQK + DV) * topk * 2 * H) / (ms * 1e-3) / 1e12 + print(f"fwd tflops = {tflops:.2f}") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--test_correctness", action="store_true") + parser.add_argument("--profile", action="store_true") + args = parser.parse_args() + if args.test_correctness: + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 1, 1024, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined(B, S, SKV, H, HKV, DQK, DV, topk, dtype, check_correctness=True, profile=args.profile) + else: + # Prefill Benchmark: long context + print(" --- Prefill Benchmark --- ") + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 2, 4096, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined( + B, S, SKV, H, HKV, DQK, DV, topk, dtype, q_start_s_index=4096, check_correctness=False, profile=args.profile + ) + + # Decode Benchmark: large batch size, high throughput generation + print("\n --- Decode Benchmark --- ") + # Increase batch size to saturate h100 for decode + B, S, SKV, H, HKV, DQK, DV, topk, dtype = 128 * 16, 2, 8192, 128, 1, 576, 512, 2048, torch.bfloat16 + test_sparse_mla_fwd_pipelined( + B, S, SKV, H, HKV, DQK, DV, topk, dtype, q_start_s_index=2048 + 4096, check_correctness=False, profile=args.profile + ) diff --git a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py index 983798f9f0..9e4c6a63d9 100644 --- a/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py +++ b/examples/deepseek_v32/test_tilelang_example_deepseek_v32.py @@ -9,30 +9,34 @@ import sparse_mla_bwd 
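# The tests below pin the compute capability to exactly 9.0: these kernels now
# rely on Hopper-specific paths (wgmma + cp.async mbarriers), which presumably
# do not carry over to newer architectures, so a `ge` check would over-match.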
+@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_topk_selector(): topk_selector.test_topk_selector() +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_fp8_lighting_indexer(): fp8_lighting_indexer.test_fp8_lighting_indexer(S=512, SKV=1024, H=32, HKV=1, D=64, kv_stride=1) @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_sparse_mla_fwd(): # small shapes for testing sparse_mla_fwd.test_sparse_mla_fwd(S=256, SKV=1024, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_sparse_mla_fwd_pipelined(): # small shapes for testing sparse_mla_fwd_pipelined.test_sparse_mla_fwd_pipelined(S=256, SKV=512, H=64, HKV=1, DQK=576, DV=512, topk=256, check_correctness=False) @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_sparse_mla_bwd(): sparse_mla_bwd.test_sparse_mla_bwd(S=256, SKV=512, H=64, HKV=1, DQKV=576, DV=512, topk=256, check_correctness=False) sparse_mla_bwd.test_sparse_mla_bwd( diff --git a/examples/deepseek_v32/topk_selector.py b/examples/deepseek_v32/topk_selector.py index 078eb26868..8b29c6fd5e 100644 --- a/examples/deepseek_v32/topk_selector.py +++ b/examples/deepseek_v32/topk_selector.py @@ -8,18 +8,18 @@ def convert_to_uint16(x): - hval = T.Cast(T.float16, x) - bits_uint = T.reinterpret(T.uint16, hval) + hval = T.cast(x, T.float16) + bits_uint = T.reinterpret(hval, T.uint16) bits_uint = T.if_then_else(x < 0, ~bits_uint & (0xFFFF), bits_uint | (0x8000)) return bits_uint >> 8 def convert_to_uint32(x): - bits_uint = T.reinterpret(T.uint32, x) + bits_uint = T.reinterpret(x, T.uint32) bits_uint = T.if_then_else( x < 0, - ~bits_uint & T.Cast(T.uint32, (0xFFFFFFFF)), - bits_uint | T.Cast(T.uint32, (0x80000000)), + ~bits_uint & T.cast((0xFFFFFFFF), T.uint32), + bits_uint | T.cast((0x80000000), T.uint32), ) return bits_uint @@ -57,6 +57,8 @@ def tl_topk_kernel( l_end_idx = T.alloc_var(T.int32) l_out_pos = T.alloc_var(T.int32) + pos = T.alloc_var(T.int32) + l_new_topk = topk l_start_idx = starts[bx] l_end_idx = ends[bx] @@ -99,7 +101,7 @@ def tl_topk_kernel( input_idx = s * BLOCK_SIZE + tx if input_idx < l_end_idx and input_idx >= l_start_idx and input_idx < seq_len: bin_id = convert_to_uint16(input[bx, input_idx]) - l_bin_id32 = T.Cast(T.int32, bin_id) + l_bin_id32 = T.cast(bin_id, T.int32) if l_bin_id32 > l_threshold_bin_id: # need a pos = T.atomic_add(s_histogram[bin_id32+1], 1) pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) @@ -113,7 +115,7 @@ def tl_topk_kernel( # stage 2: tail pass for round in T.serial(4): if l_new_topk <= 0: - T.loop_break() + break r_idx = round % 2 l_start_pos = topk - l_new_topk @@ -127,8 +129,8 @@ def tl_topk_kernel( l_num_input = s_num_input[r_idx] for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast( - T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + l_bin_id32 = T.cast( + ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF), T.int32 ) 
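+                    # Each tail-pass round keys the histogram on successive 8-bit digits of the
+                    # order-preserving uint32 encoding, from the most significant byte down
+                    # (shift 24 - round * 8), so candidates that tied in the coarse fp16 bin
+                    # are resolved exactly within at most four rounds.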
T.atomic_add(s_histogram[l_bin_id32], 1) T.sync_threads() @@ -156,8 +158,8 @@ def tl_topk_kernel( for s in T.serial(T.ceildiv(l_num_input, BLOCK_SIZE)): T.sync_threads() if s * BLOCK_SIZE + tx < l_num_input: - l_bin_id32 = T.Cast( - T.int32, ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF) + l_bin_id32 = T.cast( + ((convert_to_uint32(input[bx, s_input_idx[r_idx, s * BLOCK_SIZE + tx]]) >> (24 - round * 8)) & 0xFF), T.int32 ) if l_bin_id32 > l_threshold_bin_id: pos = T.atomic_add(s_histogram[l_bin_id32 + 1], 1, return_prev=True) + l_start_pos @@ -183,9 +185,6 @@ def tl_topk(input, starts, ends, topk): def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): - batch = 64 - seq_len = 32 * 1024 - topk = 2048 torch.manual_seed(1) input = torch.randn(batch, seq_len, dtype=torch.float32).cuda() starts = torch.zeros(batch, dtype=torch.int32).cuda() @@ -241,27 +240,11 @@ def test_topk_selector(batch=64, seq_len=32 * 1024, topk=2048): def run_regression_perf(batch=64, seq_len=32 * 1024, topk=2048): - batch = 64 - seq_len = 32 * 1024 - topk = 2048 torch.manual_seed(1) input = torch.randn(batch, seq_len, dtype=torch.float32).cuda() starts = torch.zeros(batch, dtype=torch.int32).cuda() ends = torch.ones(batch, dtype=torch.int32).cuda() * seq_len - indexes = tl_topk(input, starts, ends, topk) - - indexes_ref = torch.topk(input, topk, dim=-1)[1] - - for i in range(batch): - ref_np = indexes_ref[i].cpu().to(torch.int32).numpy() - trt_np = indexes[i].cpu().to(torch.int32).numpy() - - set_ref = set(ref_np) - set_trt = set(trt_np) - intersection = set_ref & set_trt - print("selected/all:", len(intersection), "/", len(set_ref), "=", len(intersection) / len(set_ref)) - from tilelang.profiler import do_bench def run_kernel_only(): diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py index 36b32c0a8a..2ae9bdf3eb 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_fp4_hopper.py @@ -41,7 +41,7 @@ def get_configs(): ) @tilelang.jit( out_idx=[-1], - pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def matmul( M, @@ -180,8 +180,8 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared): # Then, dequant. T.call_extern( func_name, - T.address_of(B_local_thread[0]), - T.address_of(B_dequantize_local_thread[0]), + T.access_ptr(B_local_thread, "r"), + T.access_ptr(B_dequantize_local_thread, "w"), 1, dtype=out_dtype, ) diff --git a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py index cc37c8bc42..0842e16856 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py +++ b/examples/dequantize_gemm/example_dequant_gemm_bf16_mxfp4_hopper.py @@ -246,8 +246,8 @@ def fast_dequant_bf16_fp4_twiddling(B_shared, B_dequantize_shared, Scale, k): # Then, dequant. 
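            # T.access_ptr replaces T.address_of below: it yields the same base pointer
            # but additionally records the access intent ("r" read / "w" write), giving
            # later passes visibility into which buffer the extern dequant routine mutates.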
T.call_extern( func_name, - T.address_of(B_local_thread[0]), - T.address_of(B_dequantize_local_thread[0]), + T.access_ptr(B_local_thread, "r"), + T.access_ptr(B_dequantize_local_thread, "w"), 1, dtype=out_dtype, ) diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py index 37826874bc..a870208083 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py +++ b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py @@ -302,8 +302,16 @@ def main( T.call_extern( "handle", "decode_i4u_to_f16", - T.address_of(B_local[j * local_size_b // num_elems_per_byte]), - T.address_of(B_dequantize_local[j * local_size_b]), + T.access_ptr( + B_local[j * local_size_b // num_elems_per_byte], + "r", + local_size_b // num_elems_per_byte, + ), + T.access_ptr( + B_dequantize_local[j * local_size_b], + "w", + local_size_b, + ), 8, ) diff --git a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py index b1f8b11328..2db3cd61a9 100644 --- a/examples/dequantize_gemm/example_dequant_gemm_w4a8.py +++ b/examples/dequantize_gemm/example_dequant_gemm_w4a8.py @@ -202,4 +202,3 @@ def run_regression_perf(m=4096, n=4096, k=4096): M, N, K = args.m, args.n, args.k main(M, N, K, args.tune) - # main(M, N, K, True) diff --git a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py index 43e97f9309..b67d8165b4 100644 --- a/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py +++ b/examples/dequantize_gemm/example_dequant_gemv_fp16xint4.py @@ -113,8 +113,8 @@ def main( if fast_decoding: T.call_extern( func_name, - T.address_of(B_quant_local[0]), - T.address_of(B_dequantize_local[0]), + T.access_ptr(B_quant_local, "r"), + T.access_ptr(B_dequantize_local, "w"), dtype=in_dtype, ) else: @@ -135,7 +135,7 @@ def main( accum_res[0] += A_local[ki] * B_dequantize_local[ki] with T.attr( - T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]), + T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]), "reduce_scope", T.reinterpret(T.uint64(0), dtype="handle"), ): diff --git a/examples/dequantize_gemm/regression_example_dequantize_gemm.py b/examples/dequantize_gemm/regression_example_dequantize_gemm.py index 4ab03784ff..51b7c53e00 100644 --- a/examples/dequantize_gemm/regression_example_dequantize_gemm.py +++ b/examples/dequantize_gemm/regression_example_dequantize_gemm.py @@ -4,7 +4,6 @@ import example_dequant_gemm_fp4_hopper import example_dequant_gemm_w4a8 import example_dequant_gemv_fp16xint4 -import example_dequant_groupedgemm_bf16_mxfp4_hopper def regression_example_dequant_gemv_fp16xint4(): @@ -23,10 +22,6 @@ def regression_example_dequant_gemm_bf16_mxfp4_hopper(): tilelang.testing.process_func(example_dequant_gemm_bf16_mxfp4_hopper.run_regression_perf) -def regression_example_dequant_groupedgemm_bf16_mxfp4_hopper(): - tilelang.testing.process_func(example_dequant_groupedgemm_bf16_mxfp4_hopper.run_regression_perf) - - def regression_example_dequant_gemm_w4a8(): tilelang.testing.process_func(example_dequant_gemm_w4a8.run_regression_perf) diff --git a/examples/dequantize_gemm/test_example_dequantize_gemm.py b/examples/dequantize_gemm/test_example_dequantize_gemm.py index a2f777222b..021402a363 100644 --- a/examples/dequantize_gemm/test_example_dequantize_gemm.py +++ b/examples/dequantize_gemm/test_example_dequantize_gemm.py @@ -3,7 +3,6 @@ import example_dequant_gemv_fp16xint4 import 
example_dequant_gemm_fp4_hopper import example_dequant_gemm_bf16_mxfp4_hopper -import example_dequant_groupedgemm_bf16_mxfp4_hopper import example_dequant_gemm_w4a8 @@ -13,25 +12,19 @@ def test_example_dequant_gemv_fp16xint4(): @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_dequant_gemm_fp4_hopper(): example_dequant_gemm_fp4_hopper.main() @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_dequant_gemm_bf16_mxfp4_hopper(): example_dequant_gemm_bf16_mxfp4_hopper.main() @tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_dequant_groupedgemm_bf16_mxfp4_hopper(): - example_dequant_groupedgemm_bf16_mxfp4_hopper.main() - - -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_dequant_gemm_w4a8(): example_dequant_gemm_w4a8.main() diff --git a/examples/distributed/example_allgather_gemm_specialized.py b/examples/distributed/example_allgather_gemm_specialized.py new file mode 100644 index 0000000000..17a6a6a6e8 --- /dev/null +++ b/examples/distributed/example_allgather_gemm_specialized.py @@ -0,0 +1,243 @@ +import os +import argparse + +import torch +import torch.distributed as dist +import torch.multiprocessing + +import tilelang +import tilelang.language as T +from tilelang.carver.arch import driver +from tilelang.distributed import init_dist +from tilelang.distributed import perf_fn +from tilelang.utils.allocator import get_allocator + +tilelang.disable_cache() +os.environ["NCCL_DEBUG"] = "WARN" + + +@tilelang.jit +def ag_gemm_sm_specialized_kernel( + M, + N, + K, + num_ranks, + num_comm_sms: int, + block_M: int, + block_N: int, + block_K: int, + threads: int, + dtype: str = "float16", +): + sm_num = driver.get_num_sms() + num_comp_sms = sm_num - num_comm_sms + M_per_rank = M // num_ranks + N_per_rank = N // num_ranks + m_blocks = T.ceildiv(M, block_M) + n_blocks = T.ceildiv(N_per_rank, block_N) + local_m_blocks = T.ceildiv(M_per_rank, block_M) + k_blocks = T.ceildiv(K, block_K) + total_tiles = m_blocks * n_blocks + waves = T.ceildiv(total_tiles, num_comp_sms) + GROUP_SIZE_M = 8 + accum_dtype = "float" + + @T.prim_func + def main( + A_local: T.Tensor((M_per_rank, K), dtype), + B: T.Tensor((K, N_per_rank), dtype), + mcast_A: T.Tensor((M, K), dtype), + gathered_A: T.Tensor((M, K), dtype), + mcast_signal: T.Tensor((m_blocks,), "uint32"), + local_signal: T.Tensor((m_blocks,), "uint32"), + grid_barrier: T.Tensor((num_ranks,), "int32"), + C: T.Tensor((M, N_per_rank), dtype), + local_rank: T.int32, + ): + with T.Kernel(sm_num, threads=threads) as bid: + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_shared = T.alloc_shared((block_M, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + tid = T.get_thread_binding(0) + + if bid == 0: + for i in T.serial(T.ceildiv(m_blocks, threads)): + signal_idx = i * threads + tid + if signal_idx < m_blocks: + local_signal[signal_idx] = 0 + T.fence_sys() + T.barrier_blocks(grid_barrier) + + if bid < num_comp_sms: + for w in T.serial(waves): + tile_id = bid + w * num_comp_sms + if tile_id < total_tiles: + num_pid_in_group = GROUP_SIZE_M * n_blocks + group_id = tile_id // num_pid_in_group 
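+                        # Grouped tile ordering (the usual grouped-matmul rasterization): tiles
+                        # are walked in GROUP_SIZE_M-tall, column-major groups so that
+                        # consecutively scheduled tiles reuse the same A rows / B columns while
+                        # they are still hot in L2.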
+ first_pid_m = group_id * GROUP_SIZE_M + group_size_m = T.min(m_blocks - first_pid_m, GROUP_SIZE_M) + pid_m = first_pid_m + ((tile_id % num_pid_in_group) % group_size_m) + pid_n = (tile_id % num_pid_in_group) // group_size_m + + if tid == 0: + T.wait_eq(local_signal[pid_m], 1) + + T.clear(C_local) + for k in T.Pipelined(k_blocks, num_stages=3): + T.copy(gathered_A[pid_m * block_M, k * block_K], A_shared) + T.copy(B[k * block_K, pid_n * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + T.copy(C_local, C_shared) + T.copy(C_shared, C[pid_m * block_M, pid_n * block_N]) + else: + loaded = T.alloc_barrier([256]) + parity = 0 + comm_sm_id = bid - num_comp_sms + for local_m in T.serial(T.ceildiv(local_m_blocks, num_comm_sms)): + local_pid_m = comm_sm_id + local_m * num_comm_sms + if local_pid_m < local_m_blocks: + global_pid_m = local_rank * local_m_blocks + local_pid_m + for k in T.serial(k_blocks): + T.tma_load(A_local[local_pid_m * block_M, k * block_K], A_shared) + T.mbarrier_arrive(loaded) + T.mbarrier_wait_parity(loaded, parity) + parity = (parity + 1) % 2 + T.copy( + A_shared, + mcast_A[global_pid_m * block_M, k * block_K], + ) # TODO(wt): Change to canonical mcast tma store later + + T.fence_sys() + if tid == 0: + T.multimem_signal(mcast_signal[global_pid_m], 1) + + return main + + +def ag_gemm_op( + A, + B, + mcast_A, + gathered_A, + mcast_signal, + local_signal, + grid_barrier, + C, + kernel, + local_rank, +): + kernel(A, B, mcast_A, gathered_A, mcast_signal, local_signal, grid_barrier, C, local_rank) + return C + + +def torch_ag_gemm(group: torch.distributed.ProcessGroup, A: torch.Tensor, B: torch.Tensor, ag_out: torch.Tensor): + torch.distributed.all_gather_into_tensor(ag_out, A, group) + return torch.matmul(ag_out, B) + + +def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): + dtype = torch.float16 + M, N, K = args.M, args.N, args.K + block_M, block_N, block_K = args.block_m, args.block_n, args.block_k + threads = args.threads + num_comm_sms = args.num_comm_sms + + assert M % num_local_ranks == 0, "M must be divisible by num-processes" + assert N % num_local_ranks == 0, "N must be divisible by num-processes" + assert (M // num_local_ranks) % block_M == 0, "M_per_rank must be divisible by block_m" + assert (N // num_local_ranks) % block_N == 0, "N_per_rank must be divisible by block_n" + assert K % block_K == 0, "K must be divisible by block_k" + assert 0 < num_comm_sms < driver.get_num_sms(), "num_comm_sms must leave at least one compute SM" + + M_per_rank = M // num_local_ranks + N_per_rank = N // num_local_ranks + m_blocks = M // block_M + + rank, num_ranks, group = init_dist(local_rank, num_local_ranks) + assert rank == local_rank and num_ranks == num_local_ranks, "only support single-node launch for now" + + dtype_bytes = torch.tensor([], dtype=dtype).element_size() + signal_bytes = torch.tensor([], dtype=torch.uint32).element_size() + # _allocate_mcast_tensor uses aligned bump allocation; keep room for padding + # between the gathered A buffer and the signal buffer. 
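+    # With the defaults (M=32768, K=2048, fp16, block_M=128) this is 128 MiB
+    # for the gathered A replica plus 1 KiB of signals; the extra 4096 bytes
+    # absorb the allocator's alignment padding.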
+ mcast_bytes = M * K * dtype_bytes + m_blocks * signal_bytes + 4096 + allocator = get_allocator( + size=2**30, + device=f"cuda:{local_rank}", + is_distributed=True, + local_rank=local_rank, + num_local_ranks=num_local_ranks, + group=group, + use_vmm=True, + mcast_size=mcast_bytes, + ) + + kernel = ag_gemm_sm_specialized_kernel( + M, + N, + K, + num_local_ranks, + num_comm_sms, + block_M, + block_N, + block_K, + threads, + ) + kernel.initialize(allocator=allocator) + if local_rank == 0 and args.print_source: + print(kernel.get_kernel_source()) + + torch.manual_seed(42 + local_rank) + A = tilelang.tensor((M_per_rank, K), dtype, allocator=allocator).normal_() + B = tilelang.tensor((K, N_per_rank), dtype, allocator=allocator).normal_() + C = tilelang.tensor((M, N_per_rank), dtype, allocator=allocator) + grid_barrier = tilelang.tensor((num_local_ranks,), torch.int32, allocator=allocator).zero_() + + mcast_A_flat, gathered_A_flat = allocator._allocate_mcast_tensor((M * K,), dtype) + mcast_signal, local_signal = allocator._allocate_mcast_tensor((m_blocks,), torch.uint32) + mcast_A = mcast_A_flat.view(M, K) + gathered_A = gathered_A_flat.view(M, K) + + dist.barrier(group) + tilelang_C = ag_gemm_op(A, B, mcast_A, gathered_A, mcast_signal, local_signal, grid_barrier, C, kernel, local_rank) + torch.cuda.synchronize() + dist.barrier(group) + + torch_ag_buffer = torch.empty((M, K), dtype=dtype, device=f"cuda:{local_rank}") + torch_C = torch_ag_gemm(group, A, B, torch_ag_buffer) + + if torch.allclose(torch_C, tilelang_C, atol=1e-2, rtol=1e-2): + print(f"rank {local_rank} check passed.") + else: + max_diff = (torch_C - tilelang_C).abs().max().item() + print(f"rank {local_rank} check failed. max_diff={max_diff}") + + tl_t = perf_fn( + lambda: ag_gemm_op(A, B, mcast_A, gathered_A, mcast_signal, local_signal, grid_barrier, C, kernel, local_rank), + warmup=args.warmup, + rep=args.rep, + ) + print(f"rank {local_rank} tilelang specialized ag_gemm time: {tl_t:.2f} ms, TFLOPS: {2 * M * N * K / 1e9 / tl_t / num_local_ranks:.2f}") + + allocator.close() + dist.destroy_process_group() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--num-processes", type=int, default=8) + parser.add_argument("--M", type=int, default=32768) + parser.add_argument("--N", type=int, default=16384) + parser.add_argument("--K", type=int, default=2048) + parser.add_argument("--block-m", type=int, default=128) + parser.add_argument("--block-n", type=int, default=256) + parser.add_argument("--block-k", type=int, default=64) + parser.add_argument("--threads", type=int, default=256) + parser.add_argument("--num-comm-sms", type=int, default=8) + parser.add_argument("--warmup", type=int, default=5) + parser.add_argument("--rep", type=int, default=10) + parser.add_argument("--print-source", action="store_true") + args = parser.parse_args() + + torch.multiprocessing.spawn(main, args=(args.num_processes, args), nprocs=args.num_processes, join=True) diff --git a/examples/distributed/example_multimem_allreduce.py b/examples/distributed/example_multimem_allreduce.py new file mode 100644 index 0000000000..9247d9e040 --- /dev/null +++ b/examples/distributed/example_multimem_allreduce.py @@ -0,0 +1,123 @@ +""" +Multimem allreduce example using NVSwitch multicast instructions. + +Multi-process multi-GPU: each process manages one GPU, multicast handle +shared via fabric handles through torch.distributed. 
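+
+Each rank writes its local contribution into its own physical mapping of the
+multicast buffer; a single multimem ld_reduce issued on the multicast address
+then returns the element-wise sum over all ranks' replicas.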
+ +Usage: + export TILESCALE_USE_VMM=1 + export NCCL_IB_DISABLE=1 + export TILELANG_USE_DISTRIBUTED=1 + python examples/distributed/example_multimem_allreduce.py [--num-processes 8] + +Requirements: + - NVSwitch with multicast support (H100/B200 DGX) +""" + +import os +import argparse + +import torch +import torch.distributed as dist +import torch.multiprocessing + +import tilelang +import tilelang.language as T +from tilelang.distributed import init_dist +from tilelang.utils.allocator import get_allocator + +tilelang.disable_cache() +os.environ["NCCL_DEBUG"] = "WARN" + + +def multimem_allreduce_kernel(N, block_N, threads): + @T.prim_func + def main( + mcast_buf: T.Tensor((N,), "float32"), + result: T.Tensor((N,), "float32"), + ): + with T.Kernel(T.ceildiv(N, block_N), threads=threads) as (bx,): + result_local = T.alloc_fragment([block_N], "float32") + T.multimem_ld_reduce( + mcast_buf[bx * block_N : (bx + 1) * block_N], + result_local, + reduce_op=T.MultimemReduceOp.ADD, + ) + T.copy(result_local, result[bx * block_N : (bx + 1) * block_N]) + + return main + + +def main(local_rank: int, num_local_ranks: int, args: argparse.Namespace): + N = args.N + BLOCK_N = args.block_n + threads = args.threads + + rank, num_ranks, group = init_dist(local_rank, num_local_ranks) + + # Create allocator with integrated multicast buffer + allocator = get_allocator( + size=N * 4, # float32 = 4 bytes + device=f"cuda:{local_rank}", + is_distributed=True, + local_rank=local_rank, + num_local_ranks=num_local_ranks, + group=group, + mcast_size=N * 4, + ) + + # Compile kernel + kernel = tilelang.compile( + multimem_allreduce_kernel(N, BLOCK_N, threads), + pass_configs={"tl.disable_tma_lower": True}, + ) + if local_rank == 0 and args.print_source: + print(kernel.get_kernel_source()) + + # Random input per rank + torch.manual_seed(42 + local_rank) + local_data = torch.randn(N, dtype=torch.float32, device=f"cuda:{local_rank}") + + # Allocate from multicast buffer + # mcast_tensor: MC VA for multimem instructions (read) + # local_tensor: physical VA for writing data + mcast_tensor, local_tensor = allocator._allocate_mcast_tensor((N,), torch.float32) + + # Write to physical memory (NOT the MC VA) + local_tensor.copy_(local_data) + torch.cuda.synchronize() + dist.barrier(group) + result = torch.empty(N, dtype=torch.float32, device=f"cuda:{local_rank}") + kernel(mcast_tensor, result) + torch.cuda.synchronize() + + # torch.distributed reference + expected = local_data.clone() + dist.all_reduce(expected, op=dist.ReduceOp.SUM, group=group) + + # Compare (fp32 should be exact or near-exact) + atol = 1e-5 + max_diff = (result - expected).abs().max().item() + passed = max_diff < atol + + if local_rank == 0: + print(f"N={N}, num_ranks={num_ranks}, max_diff={max_diff:.4f}, atol={atol}") + if passed: + print(f"[rank {local_rank}] PASSED") + else: + print(f"[rank {local_rank}] FAILED (max_diff={max_diff:.4f})") + + allocator.close() + dist.destroy_process_group() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--num-processes", type=int, default=8) + parser.add_argument("--N", type=int, default=65536) + parser.add_argument("--block_n", type=int, default=4096) + parser.add_argument("--threads", type=int, default=128) + parser.add_argument("--print_source", action="store_true") + args = parser.parse_args() + + torch.multiprocessing.spawn(main, args=(args.num_processes, args), nprocs=args.num_processes, join=True) diff --git a/examples/dsa_hisa/README.md b/examples/dsa_hisa/README.md new 
file mode 100644 index 0000000000..5eb0f72a8a --- /dev/null +++ b/examples/dsa_hisa/README.md @@ -0,0 +1,200 @@ +# tilelang_kernels — hisa prefill pipeline + +Tilelang prefill implementation of **hisa** (HIerarchical Sparse Attention). +Paper: . + +## What is HISA? + +HISA optimizes DeepSeek sparse attention by a plug-and-play replacement +for the indexer that rewrites the search path from a flat token scan into +a two-stage hierarchical procedure. + +**Stage 1 — coarse block-level selection.** Group K tokens into pool blocks +of `k_block_size` tokens, mean-pool each block, then score each query +against all pool blocks and pick the top `block_topk` blocks per query. + +**Stage 2 — fine-grained token-level scoring.** For each query, run a +full-resolution MQA over the raw tokens inside its selected blocks, then +pick the top `topk_tokens` tokens per query. + +## Files + +| file | step | role | +|---|---|---| +| `fp8_block_mean_pooling.py` | 1.1 | Mean-pool raw K into pool blocks (fp8 + per-block f32 scale) | +| `pool_mqa_fp8.py` | 1.2 | fp8×fp8 score `Q · pooled_K` → one logit per (query, pool block) | +| `clean_and_maintain_logits.py` | 1.3 | In-place mask on stage-1 logits: -inf outside per-query range, +inf at first/last valid block | +| `block_sparse_mqa_fp8.py` | 2.1 | fp8×fp8 fine-grained score over the raw tokens of the `block_topk` selected blocks | +| `hisa.py` | — | End-to-end orchestration: all four kernels + the two `torch.topk` steps + the index-translation post-processing | + +Each per-kernel file has one `test_*` entry that (a) runs the kernel + +torch ref, (b) asserts via `torch.testing.assert_close`, (c) prints the +latency of the kernel. `hisa.py` has `test_hisa` that runs the full +pipeline, checks the output-index mask invariant, and prints end-to-end +latency. + +## Per-kernel reference + +### 1.1 `fp8_block_mean_pooling.py` + +**Function**: `fp8_native_block_mean_pooling` + +**Meaning**: flat per-block mean of the chunk's K tokens, re-quantized to +fp8 with a per-block f32 scale. Groups `N` K tokens into +`ceildiv(N, k_block_size)` pool blocks. + +**Interface**: +```python +blocked_k, blocked_k_scale = fp8_native_block_mean_pooling_interface( + k, # [N, D] fp8 + k_scale, # [N] f32 — per-token scale from indexer_k_quant_and_cache + k_block_size, +) +# blocked_k: [num_blocks, D] fp8 +# blocked_k_scale: [num_blocks] f32 +``` + +**What it does**: per pool block `b` of size `kb = k_block_size`, +1. dequantize each of the `kb` tokens: `k_f[i] = k_fp8[i] * k_scale[i]` +2. average across the block in f32: `mean = sum_i k_f[i] / kb` (or the + actual valid count for the ragged tail block) +3. re-quantize the f32 mean to fp8 with a per-block scale + `block_scale = max(max_abs(mean) / 448, 1e-10)`, writing + `blocked_k[b] = fp8(mean / block_scale)` and `blocked_k_scale[b] = block_scale`. + +### 1.2 `pool_mqa_fp8.py` + +**Function**: `pool_mqa_attn_return_logits_fp8` + +**Meaning**: coarse-grained fp8 multi-query attention over the **pooled** K +(one vector per pool block). Produces one logit per (query, pool-block). 
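+
+For intuition, a dense torch sketch of this score (ignoring the per-query
+block range, which step 1.3 masks right afterwards) looks like:
+
+```python
+# Dense reference for illustration; the real kernel only writes each
+# tile's visible union and applies the per-block scale after the fp8 GEMM.
+s = torch.einsum("mhd,nd->mnh", q_fp8.float(),
+                 blocked_kv_fp8.float() * blocked_kv_scale[:, None])
+block_k_score = (s.clamp(min=0) * weights_f32[:, None, :]).sum(dim=-1)  # [M, Nb]
+```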
+ +**Interface**: +```python +block_k_score = pool_mqa_attn_return_logits_fp8_interface( + q_fp8, # [M, H, D] fp8 + blocked_kv_fp8, # [Nb, D] fp8 (from step 1.1) + blocked_kv_scale, # [Nb] f32 (from step 1.1) + weights_f32, # [M, H] f32 + cu_seqlen_blocked_ks, # [M] int32 — per-query start in pool-block coords + cu_seqlen_blocked_ke, # [M] int32 — per-query end in pool-block coords +) +# block_k_score: [M, Nb] f32 +``` + +**What it does**: for each query `m` and each pool block `n` in +`[cu_seqlen_blocked_ks[m], cu_seqlen_blocked_ke[m])`, +``` +block_k_score[m, n] = sum_h ReLU(q[m, h] · blocked_k[n]) * blocked_k_scale[n] * weights[m, h] +``` +Uses tile-level fp8×fp8→f32 Tensor Core GEMM; the per-block scale is +applied post-GEMM. The kernel processes queries in tiles of size +`block_Q × block_N` and **writes the union of the tile's queries' visible +K ranges** — entries outside an individual query's range inside that +union still carry raw dot-product values (they will be masked by +step 1.3 next). Entries outside the tile union are left at their +zero-init value. + +### 1.3 `clean_and_maintain_logits.py` + +**Function**: `clean_and_maintain_logits_` + +**Meaning**: in-place post-kernel mask on the stage-1 logits. + +**Interface**: +```python +clean_and_maintain_logits_interface( + logits, # [M, Nb] f32 — stage-1 output; modified in place + cu_seqlen_ks, # [M] int32 — per-row start (inclusive) + cu_seqlen_ke, # [M] int32 — per-row end (exclusive) +) +``` + +**What it does**: for each row `m`, +- positions outside `[cu_seqlen_ks[m], cu_seqlen_ke[m])` → set to `-inf` + (so `torch.topk` ignores them), +- positions `cu_seqlen_ks[m]` and `cu_seqlen_ke[m] - 1` → set to `+inf` + (force-maintain the boundary blocks: they are always picked by the + subsequent top-block selection — a standard hisa trick to preserve + sink and local blocks). + +### 2.1 `block_sparse_mqa_fp8.py` + +**Function**: `fp8_native_block_sparse_mqa_attn_return_logits` + +**Meaning**: fine-grained fp8 MQA over only the **raw K tokens** inside the +top-`block_topk` pool blocks selected per query. Two kernel variants are +auto-dispatched by the factory: +- general (`kv_block_size > block_N`): pipelined sub-block inner loop +- small-pooling-size (`kv_block_size == block_N`): single pass, no pipeline + +**Interface**: +```python +block_sparse_logits = fp8_native_block_sparse_mqa_attn_return_logits_interface( + q, # [M, H, D] fp8 + k, # [N, D] fp8 + k_scale, # [N] f32 + topk_block_index, # [M, block_topk] int64 — from torch.topk over stage-1 scores + kv_block_size, # == k_block_size + weights, # [M, H] f32 + cu_seqlen_ks, # [M] int32 — per-query K start (absolute, in raw tokens) + cu_seqlen_ke, # [M] int32 — per-query K end +) +# block_sparse_logits: [M, block_topk * kv_block_size] f32 +``` + +**What it does**: for each query `m`, for each selected block +`t ∈ [0, block_topk)` with `blk = topk_block_index[m, t]`, for each +in-block offset `i ∈ [0, kv_block_size)`, +``` +k_abs = blk * kv_block_size + i +if k_abs ∉ [cu_seqlen_ks[m], cu_seqlen_ke[m]) or k_abs >= N: + block_sparse_logits[m, t * kv_block_size + i] = -inf +else: + block_sparse_logits[m, t * kv_block_size + i] = + sum_h ReLU(q[m, h] · k[k_abs]) * k_scale[k_abs] * weights[m, h] +``` +The out-of-range mask is written directly by this kernel — no separate +mask pass is needed here (unlike stage 1). + +### End-to-end `hisa.py` + +**Function**: `hisa_indexer` + +**Meaning**: single entry point that runs the full pipeline below. 
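+
+The post-processing after the two `torch.topk` calls (step 2.3 in the
+pipeline below) boils down to plain torch index arithmetic:
+
+```python
+# stage-2.5 slot ids -> absolute K positions -> per-query relative offsets
+blk = torch.gather(topk_block_indices, -1, relevant_topk_indices // k_block_size)
+topk_indices = (blk * k_block_size + relevant_topk_indices % k_block_size).to(torch.int32)
+topk_indices -= cu_seqlen_ks[:, None]  # relative to the query's K start
+span = (cu_seqlen_ke - cu_seqlen_ks)[:, None]
+topk_indices = topk_indices.masked_fill((topk_indices < 0) | (topk_indices >= span), -1)
+```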
+ +**Interface**: +```python +topk_indices = hisa_indexer( + q, # [M, H, D] fp8 + k, # [N, D] fp8 + k_scale, # [N] f32 + weights, # [M, H] f32 + cu_seqlen_ks, # [M] int32 — per-query K start + cu_seqlen_ke, # [M] int32 — per-query K end + *, + k_block_size, # pool block size (=128 in DeepSeek-V3.2) + block_topk, # number of top pool blocks kept per query + topk_tokens, # final top-k size handed to the sparse attention +) +# topk_indices: [M, topk_tokens] int32 — each row is the query's top-k K +# positions expressed as offsets within its own [cu_ks, cu_ke) window. +# Out-of-range slots are -1. +``` + +**Pipeline**: + +``` +(1.1) fp8_native_block_mean_pooling K, k_scale → blocked_k, blocked_k_scale +(1.2) pool_mqa_attn_return_logits_fp8 Q × blocked_k → block_k_score[M, Nb] +(1.3) clean_and_maintain_logits in-place mask (-inf/+inf) on block_k_score +(1.4) torch.topk(block_k_score.bfloat16(), → topk_block_indices[M, block_topk] int64 + k=block_topk, sorted=False) +(2.1) fp8_native_block_sparse_mqa_… Q × K[selected] → block_sparse_logits + [M, block_topk * k_block_size] +(2.2) torch.topk(block_sparse_logits, → relevant_topk_indices[M, topk_tokens] int64 + k=topk_tokens) +(2.3) (Python) gather topk_block_indices + → absolute K positions, then subtract + arith + subtract cu_seqlen_ks + mask cu_seqlen_ks for per-query-relative offsets + → topk_indices[M, topk_tokens] int32 +``` diff --git a/examples/dsa_hisa/block_sparse_mqa_fp8.py b/examples/dsa_hisa/block_sparse_mqa_fp8.py new file mode 100644 index 0000000000..10f95bb204 --- /dev/null +++ b/examples/dsa_hisa/block_sparse_mqa_fp8.py @@ -0,0 +1,269 @@ +import tilelang +from tilelang import language as T +from tilelang.profiler import do_bench +import torch + +from tilelang_utils import prepare_ks_ke_from_cu_seqlens + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def fp8_native_block_sparse_mqa_attn_return_logits( + IndexQ, + IndexK, + IndexKScale, + TopKBlockIndex, + Weights, + CuSeqLenKS, + CuSeqLenKE, + heads: int = 64, + index_dim: int = 128, + kv_block_size: int = 128, + topk: int = 64, + block_N: int = 128, + num_stages: int = 1, + threads: int = 256, +): + fp8_dtype = T.float8_e4m3fn + accum_dtype = T.float32 + index_dtype = T.int32 + topk_index_dtype = T.int64 + + seq_len, seq_len_kv = T.const("seq_len, seq_len_kv") + + H_per_block = heads + block_N = min(block_N, kv_block_size // 2) + assert kv_block_size % block_N == 0, "block_N must divide kv_block_size" + + IndexQ: T.Tensor[[seq_len * heads, index_dim], fp8_dtype] + IndexK: T.Tensor[[seq_len_kv, index_dim], fp8_dtype] + IndexKScale: T.Tensor[[seq_len_kv], accum_dtype] + TopKBlockIndex: T.Tensor[[seq_len, topk], topk_index_dtype] + Weights: T.Tensor[[seq_len, heads], accum_dtype] + CuSeqLenKS: T.Tensor[[seq_len], index_dtype] + CuSeqLenKE: T.Tensor[[seq_len], index_dtype] + + Logits = T.empty((seq_len, topk * kv_block_size), accum_dtype) + + with T.Kernel(seq_len, threads=threads) as bx: + index_q_shared = T.alloc_shared([H_per_block, index_dim], fp8_dtype) + index_k_shared = T.alloc_shared([block_N, index_dim], fp8_dtype) + # Shared (zero-init'd) — see note in the hisa source about serial-topk + # loop making shared slightly faster than fragment here. 
+ scale_shared = T.alloc_shared([block_N], accum_dtype) + + s = T.alloc_fragment([block_N, H_per_block], accum_dtype) + s_reshaped = T.reshape(s, (block_N, H_per_block // heads, heads)) + logits = T.alloc_fragment([block_N, H_per_block // heads], accum_dtype) + weights = T.alloc_fragment([H_per_block // heads, heads], accum_dtype) + + seq_len_i = bx + + cu_k_s_min = CuSeqLenKS[seq_len_i] + cu_k_e_max = CuSeqLenKE[seq_len_i] + + T.copy(IndexQ[seq_len_i * heads : seq_len_i * heads + H_per_block, :], index_q_shared) + T.copy(Weights[seq_len_i, :], weights) + + for n_i in T.serial(topk): + topk_block_id = T.cast(TopKBlockIndex[seq_len_i, n_i], index_dtype) + block_s = topk_block_id * kv_block_size + for b_i in T.Pipelined(kv_block_size // block_N, num_stages=num_stages): + block_s_i = block_s + b_i * block_N + + T.copy(IndexK[block_s_i : block_s_i + block_N, :], index_k_shared) + for bn_i in T.Parallel(block_N): + scale_shared[bn_i] = IndexKScale[block_s_i + bn_i] + + T.gemm( + index_k_shared, + index_q_shared, + s, + transpose_B=True, + clear_accum=True, + policy=T.GemmWarpPolicy.FullRow, + ) + + for bn_i, bq_i, h_i in T.Parallel(block_N, H_per_block // heads, heads): + s_reshaped[bn_i, bq_i, h_i] = T.max(s_reshaped[bn_i, bq_i, h_i] * scale_shared[bn_i], 0) * weights[bq_i, h_i] + + T.reduce_sum(s_reshaped, logits, dim=-1, clear=True) + + for i_i in T.Parallel(block_N): + k_i = block_s_i + i_i + if k_i < cu_k_s_min or k_i >= cu_k_e_max: + logits[i_i, 0] = -T.infinity(accum_dtype) + + for bn_i in T.Parallel(block_N): + Logits[seq_len_i, n_i * kv_block_size + b_i * block_N + bn_i] = logits[bn_i, 0] + + return Logits + + +def fp8_native_block_sparse_mqa_attn_return_logits_interface( + q: torch.Tensor, + k: torch.Tensor, + k_scale: torch.Tensor, + topk_block_index: torch.Tensor, + kv_block_size: int, + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +): + seq_len, heads, index_dim = q.shape + topk = topk_block_index.shape[1] + logits = fp8_native_block_sparse_mqa_attn_return_logits( + q.view(seq_len * heads, index_dim), + k, + k_scale, + topk_block_index, + weights, + cu_seqlen_ks, + cu_seqlen_ke, + heads=heads, + index_dim=index_dim, + kv_block_size=kv_block_size, + topk=topk, + ) + return logits + + +def ref_fp8_block_sparse_mqa( + q_fp8: torch.Tensor, + k_fp8: torch.Tensor, + k_scale: torch.Tensor, + topk_block_index: torch.Tensor, + kv_block_size: int, + weights: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + M, H, D = q_fp8.shape + N = k_fp8.shape[0] + topk = topk_block_index.shape[1] + + block_starts = topk_block_index.long() * kv_block_size # [M, topk] + pos_in_block = torch.arange(kv_block_size, device=q_fp8.device) + k_abs = block_starts[..., None] + pos_in_block[None, None, :] # [M, topk, B] + k_safe = k_abs.clamp(0, N - 1) + + q_f = q_fp8.float() + k_f = k_fp8.float() * k_scale[:, None] + gathered_k = k_f[k_safe.flatten()].reshape(M, topk, kv_block_size, D) + + s = torch.einsum("mhd,mtid->mtih", q_f, gathered_k) # [M, topk, B, H] + logits = (s.clamp(min=0) * weights[:, None, None, :]).sum(dim=-1) # [M, topk, B] + + in_range = (k_abs >= cu_seqlen_ks.long()[:, None, None]) & (k_abs < cu_seqlen_ke.long()[:, None, None]) & (k_abs < N) + logits = logits.masked_fill(~in_range, float("-inf")) + return logits.reshape(M, topk * kv_block_size) + + +def test_fp8_block_sparse_mqa( + M: int = 1024, + H: int = 64, + D: int = 128, + kv_block_size: int = 128, + topk: int = 64, + num_seqs: int = 1, +): + """Correctness + 
speed test packing `num_seqs` equal-length causal + sequences into the [M, H, D] Q and [M, D] K tensors. Each query sees + only the prefix of its own sequence (``cu_ks = start_of_seq``, + ``cu_ke = start_of_seq + position_in_seq + 1``). + + ``topk_block_index`` is drawn at random from [0, num_k_blocks) — some + picks will point to blocks outside the query's own sequence; those + positions get -inf via the kernel's built-in mask, and the torch ref + produces the same -inf. Comparison checks both the +/-inf mask + pattern (exact) and the finite values (fp8 tolerance).""" + torch.manual_seed(0) + assert M % num_seqs == 0, f"M ({M}) must be divisible by num_seqs ({num_seqs})" + N = M # causal self-attention prefill, packed + + per_seq = M // num_seqs + cu_seqlens = torch.arange(num_seqs + 1, device="cuda", dtype=torch.long) * per_seq + ks_long, ke_long = prepare_ks_ke_from_cu_seqlens(cu_seqlens) + cu_ks = ks_long.to(torch.int32).contiguous() + cu_ke = ke_long.to(torch.int32).contiguous() + + q_bf16 = torch.randn(M, H, D, device="cuda", dtype=torch.bfloat16) + q = q_bf16.to(torch.float8_e4m3fn) + k_bf16 = torch.randn(N, D, device="cuda", dtype=torch.bfloat16) + k = k_bf16.to(torch.float8_e4m3fn) + k_scale = (0.1 + 0.01 * torch.rand(N, device="cuda", dtype=torch.float32)).contiguous() + weights = torch.randn(M, H, device="cuda", dtype=torch.float32) + + # Random per-query top-k blocks (distinct indices drawn from [0, num_blocks)). + num_k_blocks = (N + kv_block_size - 1) // kv_block_size + topk = min(topk, num_k_blocks) + g = torch.Generator(device="cuda").manual_seed(42) + topk_block_index = torch.stack([torch.randperm(num_k_blocks, generator=g, device="cuda")[:topk] for _ in range(M)]).to(torch.int64) + + # Correctness. + got = fp8_native_block_sparse_mqa_attn_return_logits_interface( + q, + k, + k_scale, + topk_block_index, + kv_block_size, + weights, + cu_ks, + cu_ke, + ) + ref = ref_fp8_block_sparse_mqa( + q, + k, + k_scale, + topk_block_index, + kv_block_size, + weights, + cu_ks, + cu_ke, + ) + # The kernel marks out-of-range as -inf. Compare finite positions only — + # the -inf mask pattern must agree exactly, so we also check that. + finite = torch.isfinite(got) & torch.isfinite(ref) + assert torch.equal(torch.isposinf(got), torch.isposinf(ref)), "pos-inf mask differs" + assert torch.equal(torch.isneginf(got), torch.isneginf(ref)), "neg-inf mask differs" + torch.testing.assert_close(got[finite], ref[finite], rtol=1e-1, atol=2e-1) + print(f" correctness: PASS (M={M}, H={H}, D={D}, kv_block_size={kv_block_size}, topk={topk}, num_seqs={num_seqs}, per_seq={per_seq})") + + # Speed. + def fn(): + return fp8_native_block_sparse_mqa_attn_return_logits_interface( + q, + k, + k_scale, + topk_block_index, + kv_block_size, + weights, + cu_ks, + cu_ke, + ) + + ms = do_bench(fn, warmup=50, rep=200) + # FLOPs: M × topk × kv_block_size × H × D (fp8×fp8) × 2 (mul+add). + total_flops = 2 * M * topk * kv_block_size * H * D + tflops = total_flops / (ms * 1e-3) / 1e12 + print(f" latency: {ms:.4f} ms ({tflops:.2f} fp8 TFLOPS)") + + +if __name__ == "__main__": + # Ref path materialises [M, topk, B, D] fp32 gathered_k which is ~M GB at + # topk=64, kv_block_size=128, D=128. Keep M modest to avoid OOM. 
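+    # Concretely, gathered_k holds M * topk * kv_block_size * D fp32 values:
+    # 2**30 elements = 4 GiB at M=1024, scaling linearly in M (~32 GiB at M=8192).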
+ # (M, H, D, kv_block_size, topk, num_seqs) + for cfg in [ + (1024, 64, 128, 128, 64, 1), + (4096, 64, 128, 128, 64, 1), + (4096, 64, 128, 128, 64, 4), + (8192, 64, 128, 128, 64, 1), + (8192, 64, 128, 128, 64, 8), + (8192, 64, 128, 64, 128, 8), + (8192, 64, 128, 256, 32, 8), + ]: + test_fp8_block_sparse_mqa(*cfg) + torch.cuda.empty_cache() diff --git a/examples/dsa_hisa/clean_and_maintain_logits.py b/examples/dsa_hisa/clean_and_maintain_logits.py new file mode 100644 index 0000000000..12ff8a4c10 --- /dev/null +++ b/examples/dsa_hisa/clean_and_maintain_logits.py @@ -0,0 +1,121 @@ +import tilelang +from tilelang import language as T +from tilelang.profiler import do_bench +import torch + +from tilelang_utils import prepare_ks_ke_from_cu_seqlens + + +@tilelang.jit +def clean_and_maintain_logits_( + Logits, + CuSeqLenKS, + CuSeqLenKE, + threads: int = 512, + block_K: int = 4096, +): + seq_len, seq_len_kv = T.const("seq_len, seq_len_kv") + + dtype = T.float + indices_dtype = T.int32 + + Logits: T.Tensor[[seq_len, seq_len_kv], dtype] + CuSeqLenKS: T.Tensor[[seq_len], indices_dtype] + CuSeqLenKE: T.Tensor[[seq_len], indices_dtype] + + with T.Kernel(seq_len, threads=threads) as bx: + tx = T.thread_binding(0, threads, thread="threadIdx.x") + cu_k_s = CuSeqLenKS[bx] + cu_k_e = CuSeqLenKE[bx] + + for n_i in T.Pipelined(T.ceildiv(seq_len_kv, block_K)): + for k_i in T.serial(block_K // threads): + idx = n_i * block_K + k_i * threads + tx + if idx == cu_k_s or idx == cu_k_e - 1: + Logits[bx, idx] = T.infinity(dtype) + if idx < cu_k_s or idx >= cu_k_e: + Logits[bx, idx] = -T.infinity(dtype) + + +def clean_and_maintain_logits_interface( + logits: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +): + """In-place: applies +inf/-inf mask based on per-row [ks, ke).""" + clean_and_maintain_logits_(logits, cu_seqlen_ks, cu_seqlen_ke) + return logits + + +def ref_clean_and_maintain_logits( + logits: torch.Tensor, + cu_seqlen_ks: torch.Tensor, + cu_seqlen_ke: torch.Tensor, +) -> torch.Tensor: + """Pure torch equivalent. Returns a new tensor (doesn't mutate the input).""" + M, N = logits.shape + out = logits.clone() + n = torch.arange(N, device=logits.device)[None, :] + mask_out = (n < cu_seqlen_ks.long()[:, None]) | (n >= cu_seqlen_ke.long()[:, None]) + out = out.masked_fill(mask_out, float("-inf")) + m_idx = torch.arange(M, device=logits.device) + out[m_idx, cu_seqlen_ks.long()] = float("inf") + out[m_idx, (cu_seqlen_ke - 1).clamp(min=0).long()] = float("inf") + return out + + +def test_clean_and_maintain_logits(M: int = 4096, N: int = 4096, num_seqs: int = 1): + """Correctness + speed test where `M` query rows are packed from + `num_seqs` equal-length causal sequences. Per-row ``cu_ks / cu_ke`` + is derived from ``prepare_ks_ke_from_cu_seqlens`` so each row sees + only the prefix of its own sequence (causal self-attention).""" + torch.manual_seed(0) + assert M % num_seqs == 0, f"M ({M}) must be divisible by num_seqs ({num_seqs})" + assert (M // num_seqs) <= N, "N must accommodate the longest sequence" + + per_seq = M // num_seqs + cu_seqlens = torch.arange(num_seqs + 1, device="cuda", dtype=torch.long) * per_seq + ks_long, ke_long = prepare_ks_ke_from_cu_seqlens(cu_seqlens) + cu_ks = ks_long.to(torch.int32).contiguous() + cu_ke = ke_long.to(torch.int32).clamp(max=N).contiguous() + + logits_init = torch.randn(M, N, device="cuda", dtype=torch.float32) + + # Run kernel in place on a copy. + got = logits_init.clone() + clean_and_maintain_logits_interface(got, cu_ks, cu_ke) + + # Ref. 
+ ref = ref_clean_and_maintain_logits(logits_init, cu_ks, cu_ke) + + # Exact equality: this kernel only writes +/-inf, other positions untouched + # (ref clones the input and does the same). Compare directly. + assert torch.equal(torch.isposinf(got), torch.isposinf(ref)), "pos-inf mask differs" + assert torch.equal(torch.isneginf(got), torch.isneginf(ref)), "neg-inf mask differs" + finite = torch.isfinite(got) & torch.isfinite(ref) + torch.testing.assert_close(got[finite], ref[finite], rtol=0.0, atol=0.0) + print(f" correctness: PASS (M={M}, N={N}, num_seqs={num_seqs}, per_seq={per_seq})") + + # Speed. + def fn(): + logits = torch.randn(M, N, device="cuda", dtype=torch.float32) # fresh copy each iter + clean_and_maintain_logits_interface(logits, cu_ks, cu_ke) + return logits + + ms = do_bench(fn, warmup=50, rep=200) + # ~2 reads + 1 write of [M, N] f32, but mostly no-op except at mask boundaries. + bytes_moved = 2 * M * N * 4 + gbps = bytes_moved / (ms * 1e-3) / 1e9 + print(f" latency: {ms:.4f} ms ({gbps:.1f} GB/s)") + + +if __name__ == "__main__": + # (M, N, num_seqs) + for cfg in [ + (4096, 4096, 1), + (4096, 4096, 4), + (16384, 16384, 1), + (16384, 16384, 8), + (65536, 65536, 16), + ]: + test_clean_and_maintain_logits(*cfg) diff --git a/examples/dsa_hisa/fp8_block_mean_pooling.py b/examples/dsa_hisa/fp8_block_mean_pooling.py new file mode 100644 index 0000000000..1c9f90cc4c --- /dev/null +++ b/examples/dsa_hisa/fp8_block_mean_pooling.py @@ -0,0 +1,146 @@ +import tilelang +from tilelang import language as T +from tilelang.profiler import do_bench +import torch + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def fp8_native_block_mean_pooling( + K, + KScale, + dim: int = 128, + pooling_block_size: int = 128, + block_N: int = 64, + num_stages: int = 1, + threads: int = 256, +): + dtype = T.float8_e4m3fn + accum_dtype = T.float32 + FP8_MAX_INV = 1.0 / 448.0 + + seq_len_k = T.const("seq_len_k") + + K: T.Tensor[[seq_len_k, dim], dtype] + KScale: T.Tensor[[seq_len_k], accum_dtype] + + num_blocks = T.ceildiv(seq_len_k, pooling_block_size) + BlockedK = T.empty((num_blocks, dim), dtype) + BlockedKScale = T.empty((num_blocks,), accum_dtype) + + with T.Kernel(num_blocks, threads=threads) as bx: + index_k = T.alloc_fragment([block_N, dim], dtype) + scale = T.alloc_fragment([block_N], accum_dtype) + acc = T.alloc_fragment([dim], accum_dtype) + max_abs = T.alloc_fragment([1], accum_dtype) + T.fill(acc, 0.0) + + k_start = bx * pooling_block_size + k_end = T.min(k_start + pooling_block_size, seq_len_k) + cur_pooling_block_size = k_end - k_start + + for b_i in T.serial(T.ceildiv(cur_pooling_block_size, block_N)): + T.fill(index_k, 0.0) + + tl_block_s = k_start + b_i * block_N + tl_block_e = T.min(k_start + (b_i + 1) * block_N, k_end) + T.copy(K[tl_block_s : tl_block_s + block_N, :], index_k) + for bn_i in T.Parallel(block_N): + scale[bn_i] = KScale[tl_block_s + bn_i] + + for bn_i, d_i in T.Parallel(block_N, dim): + index_k[bn_i, d_i] = index_k[bn_i, d_i] * scale[bn_i] + + cur_tl_block_size = tl_block_e - tl_block_s + for n_i in T.parallel(block_N): + for d_i in T.parallel(dim): + if n_i >= cur_tl_block_size: + index_k[n_i, d_i] = T.cast(0, accum_dtype) + + T.reduce_sum(index_k, acc, dim=0, clear=False) + + inv_count = T.cast(1.0, accum_dtype) / T.cast(cur_pooling_block_size, accum_dtype) + for d_i in T.Parallel(dim): + acc[d_i] = acc[d_i] * inv_count + + # Re-quantize f32 mean to fp8 with a per-block scale. 
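+        # 448 is the largest finite float8_e4m3fn value, so block_scale maps
+        # the mean back into fp8 range; the 1e-10 floor keeps the scale
+        # positive for all-zero blocks.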
+ T.reduce_absmax(acc, max_abs, dim=0, clear=True) + block_scale = T.max(max_abs[0] * T.cast(FP8_MAX_INV, accum_dtype), T.cast(1e-10, accum_dtype)) + inv_block_scale = T.cast(1.0, accum_dtype) / block_scale + + for d_i in T.Parallel(dim): + BlockedK[bx, d_i] = T.cast(acc[d_i] * inv_block_scale, dtype) + BlockedKScale[bx] = block_scale + + return BlockedK, BlockedKScale + + +def fp8_native_block_mean_pooling_interface(k: torch.Tensor, k_scale: torch.Tensor, k_block_size: int): + return fp8_native_block_mean_pooling(k, k_scale, dim=k.shape[1], pooling_block_size=k_block_size) + + +def ref_fp8_block_mean_pooling(k_fp8: torch.Tensor, k_scale: torch.Tensor, k_block_size: int) -> torch.Tensor: + """Spec: per-token dequant + per-block mean (dividing by actual valid count). + Returns the f32 mean (caller can compare against fp8*scale re-quant of the kernel).""" + N, D = k_fp8.shape + dequant = k_fp8.float() * k_scale[:, None] + num_blocks = (N + k_block_size - 1) // k_block_size + out = torch.empty(num_blocks, D, device=k_fp8.device, dtype=torch.float32) + for b in range(num_blocks): + s = b * k_block_size + e = min(s + k_block_size, N) + out[b] = dequant[s:e].sum(dim=0) / (e - s) + return out + + +def test_fp8_block_mean_pooling(N: int = 16384, D: int = 128, k_block_size: int = 128, num_seqs: int = 1): + """Correctness + speed test with `num_seqs` sequences of equal length + packed into the flat K buffer. + + NOTE: the flat mean-pool kernel is sequence-agnostic — it pools every + `k_block_size` consecutive tokens regardless of sequence boundaries. + `num_seqs` is accepted here for API consistency with the other kernels' + tests; it affects how `cu_seqlens` is laid out (shown for illustration) + but not the kernel's inputs / outputs. + """ + torch.manual_seed(0) + assert N % num_seqs == 0, f"N ({N}) must be divisible by num_seqs ({num_seqs})" + per_seq = N // num_seqs + + k_bf16 = torch.randn(N, D, device="cuda", dtype=torch.bfloat16) + k = k_bf16.to(torch.float8_e4m3fn) + k_scale = (0.1 + 0.01 * torch.rand(N, device="cuda", dtype=torch.float32)).contiguous() + + # Correctness. + blocked_k_fp8, blocked_k_scale = fp8_native_block_mean_pooling_interface(k, k_scale, k_block_size) + got = blocked_k_fp8.float() * blocked_k_scale[:, None] + ref = ref_fp8_block_mean_pooling(k, k_scale, k_block_size) + # fp8 re-quant: ~1/256 rel error on top of bf16-level precision. + torch.testing.assert_close(got, ref, rtol=5e-2, atol=5e-3) + print(f" correctness: PASS (N={N}, D={D}, k_block_size={k_block_size}, num_seqs={num_seqs}, per_seq={per_seq})") + + # Speed. + def fn(): + return fp8_native_block_mean_pooling_interface(k, k_scale, k_block_size) + + ms = do_bench(fn, warmup=50, rep=200) + num_blocks = (N + k_block_size - 1) // k_block_size + # Bytes moved: read N * D fp8 (K) + N * 4 f32 (scale) + write num_blocks * D fp8 + num_blocks * 4 f32. 
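+    # e.g. N=65536, D=128: ~8.4 MB of K plus ~0.26 MB of scales read and
+    # ~0.07 MB written per call; the kernel is firmly bandwidth-bound.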
+ bytes_moved = N * D + N * 4 + num_blocks * D + num_blocks * 4 + gbps = bytes_moved / (ms * 1e-3) / 1e9 + print(f" latency: {ms:.4f} ms ({gbps:.1f} GB/s)") + + +if __name__ == "__main__": + # (N, D, k_block_size, num_seqs) + for cfg in [ + (16384, 128, 128, 1), + (16384, 128, 128, 4), + (65536, 128, 128, 1), + (65536, 128, 128, 8), + (131072, 128, 128, 16), + ]: + test_fp8_block_mean_pooling(*cfg) diff --git a/examples/dsa_hisa/hisa.py b/examples/dsa_hisa/hisa.py new file mode 100644 index 0000000000..e9863874e1 --- /dev/null +++ b/examples/dsa_hisa/hisa.py @@ -0,0 +1,240 @@ +import torch +from tilelang.profiler import do_bench + +from fp8_block_mean_pooling import fp8_native_block_mean_pooling_interface +from pool_mqa_fp8 import pool_mqa_attn_return_logits_fp8_interface +from block_sparse_mqa_fp8 import fp8_native_block_sparse_mqa_attn_return_logits_interface +from clean_and_maintain_logits import clean_and_maintain_logits_interface +from tilelang_utils import prepare_ks_ke_from_cu_seqlens + + +def hisa_indexer( + q: torch.Tensor, # [M, H, D] fp8_e4m3fn + k: torch.Tensor, # [N, D] fp8_e4m3fn + k_scale: torch.Tensor, # [N] f32 + weights: torch.Tensor, # [M, H] f32 + cu_seqlen_ks: torch.Tensor, # [M] int32 — per-query K start (inclusive) + cu_seqlen_ke: torch.Tensor, # [M] int32 — per-query K end (exclusive) + *, + k_block_size: int, + block_topk: int, + topk_tokens: int, +) -> torch.Tensor: + """Run the full hisa prefill pipeline. + + Returns: ``[M, topk_tokens]`` int32 — each row is this query's top + ``topk_tokens`` K positions, expressed as offsets relative to + ``cu_seqlen_ks[m]`` (so ``0`` means the query's own K start). Slots + that fell outside ``[cu_seqlen_ks[m], cu_seqlen_ke[m])`` get ``-1``. + """ + # ------------------------------------------------------------------ + # Stage 0: fp8 mean-pool over K. Groups K into pool blocks of + # k_block_size tokens each; outputs one fp8 vector + f32 scale per + # pool block. Grid = (ceil(N/k_block_size),). + # ------------------------------------------------------------------ + blocked_k_fp8, blocked_k_scale = fp8_native_block_mean_pooling_interface( + k, + k_scale, + k_block_size, + ) # [Nb, D] fp8, [Nb] f32 + + # Translate the per-query K range from flat-token coords to + # pool-block coords (floor for start, ceil for end). + cu_seqlen_blocked_ks = cu_seqlen_ks // k_block_size + cu_seqlen_blocked_ke = (cu_seqlen_ke + k_block_size - 1) // k_block_size + + # ------------------------------------------------------------------ + # Stage 1: block-level Q·BlockedK score with ReLU + per-head weight + # reduction. Output is dense (kernel doesn't mask out-of-range). + # ------------------------------------------------------------------ + block_k_score = pool_mqa_attn_return_logits_fp8_interface( + q, + blocked_k_fp8, + blocked_k_scale, + weights, + cu_seqlen_blocked_ks, + cu_seqlen_blocked_ke, + ) # [M, Nb] f32 + + # Mask out-of-range entries to -inf and force +inf on first / last + # valid block so torch.topk picks the boundary blocks. + clean_and_maintain_logits_interface( + block_k_score, + cu_seqlen_blocked_ks, + cu_seqlen_blocked_ke, + ) + + # ------------------------------------------------------------------ + # Stage 1.5: top-block_topk selection. bfloat16 + sorted=False is + # ~40% faster than f32 and the downstream sparse_mqa doesn't rely + # on order. 
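+    # block_topk_eff guards tiny inputs where fewer than block_topk pool
+    # blocks exist.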
+ # ------------------------------------------------------------------ + block_topk_eff = min(block_topk, block_k_score.shape[-1]) + topk_block_indices = torch.topk( + block_k_score.bfloat16(), + k=block_topk_eff, + dim=-1, + sorted=False, + ).indices # [M, block_topk_eff] int64 + + # ------------------------------------------------------------------ + # Stage 2: fp8 fine-grained Q·K MQA over only the selected + # blocks' raw tokens (block_topk_eff blocks × k_block_size tokens + # per query). The kernel writes -inf for positions outside + # [cu_seqlen_ks[m], cu_seqlen_ke[m]). + # ------------------------------------------------------------------ + block_sparse_logits = fp8_native_block_sparse_mqa_attn_return_logits_interface( + q, + k, + k_scale, + topk_block_indices, + k_block_size, + weights, + cu_seqlen_ks, + cu_seqlen_ke, + ) # [M, block_topk_eff * k_block_size] f32 + + # ------------------------------------------------------------------ + # Stage 2.5: top-topk_tokens selection over the block_topk_eff + # × k_block_size candidate tokens. Gives per-query slot ids. + # ------------------------------------------------------------------ + topk_tokens_eff = min(topk_tokens, block_sparse_logits.shape[-1]) + relevant_topk_indices = torch.topk( + block_sparse_logits, + k=topk_tokens_eff, + dim=-1, + ).indices # [M, topk_tokens_eff] int64 + + # ------------------------------------------------------------------ + # Stage 3 (post, Python): translate slot ids → absolute K token + # position → per-query relative offset (matches vLLM indexer + # output buffer). Slots whose relative offset falls outside the + # query's visible range are set to -1. + # ------------------------------------------------------------------ + # slot = block_id_in_topk × k_block_size + offset_in_block + # where block_id_in_topk ∈ [0, block_topk_eff) + # absolute_k = topk_block_indices[m, block_id_in_topk] × k_block_size + offset_in_block + absolute_topk_block_indices = torch.gather( + topk_block_indices, + dim=-1, + index=(relevant_topk_indices // k_block_size), + ) + topk_indices = absolute_topk_block_indices * k_block_size + (relevant_topk_indices % k_block_size) + topk_indices = topk_indices.to(torch.int32) + + # Relative to this query's K start. + topk_indices -= cu_seqlen_ks[:, None] + mask_lo = topk_indices >= 0 + mask_hi = topk_indices - (cu_seqlen_ke - cu_seqlen_ks)[:, None] < 0 + mask = mask_lo & mask_hi + topk_indices = topk_indices.masked_fill(~mask, -1) + + return topk_indices + + +def test_hisa( + M: int = 1024, + H: int = 64, + D: int = 128, + k_block_size: int = 128, + block_topk: int = 8, + topk_tokens: int = 256, + num_seqs: int = 1, +): + """End-to-end smoke + speed test packing `num_seqs` equal-length causal + sequences into the flat [M, H, D] Q and [N=M, D] K tensors. + + Per-token ``cu_ks / cu_ke`` are produced by + ``prepare_ks_ke_from_cu_seqlens`` so each query sees only the prefix + of its own sequence. Validity checks are done per-query (so each + sequence's tail queries have fewer valid candidate slots). 
+ """ + torch.manual_seed(0) + assert M % num_seqs == 0, f"M ({M}) must be divisible by num_seqs ({num_seqs})" + per_seq = M // num_seqs + N = M # causal self-attention, packed + + cu_seqlens = torch.arange(num_seqs + 1, device="cuda", dtype=torch.long) * per_seq + ks_long, ke_long = prepare_ks_ke_from_cu_seqlens(cu_seqlens) + cu_ks = ks_long.to(torch.int32).contiguous() + cu_ke = ke_long.to(torch.int32).contiguous() + + q_bf16 = torch.randn(M, H, D, device="cuda", dtype=torch.bfloat16) + q = q_bf16.to(torch.float8_e4m3fn) + k_bf16 = torch.randn(N, D, device="cuda", dtype=torch.bfloat16) + k = k_bf16.to(torch.float8_e4m3fn) + k_scale = (0.1 + 0.01 * torch.rand(N, device="cuda", dtype=torch.float32)).contiguous() + weights = torch.randn(M, H, device="cuda", dtype=torch.float32) + + topk_indices = hisa_indexer( + q, + k, + k_scale, + weights, + cu_ks, + cu_ke, + k_block_size=k_block_size, + block_topk=block_topk, + topk_tokens=topk_tokens, + ) + + # Sanity checks. + assert topk_indices.shape == (M, topk_tokens), f"unexpected output shape {tuple(topk_indices.shape)}" + assert topk_indices.dtype == torch.int32 + + # Every non-(-1) offset must be within [0, cu_ke[m] - cu_ks[m]). + valid = topk_indices >= 0 + spans = (cu_ke - cu_ks)[:, None].expand_as(topk_indices) + in_range = topk_indices < spans + assert (valid == (valid & in_range)).all(), "some valid offset falls outside its query's K window" + + # Per-query expected number of valid slots = min(cu_ke[m] - cu_ks[m], + # topk_tokens) (clipped by K range and by block_topk × k_block_size). + expected_valid = torch.minimum( + (cu_ke - cu_ks).clamp(min=0), + torch.tensor(min(topk_tokens, block_topk * k_block_size), device=cu_ke.device), + ) + got_valid = valid.sum(dim=-1).to(torch.int32) + frac_match = (got_valid == expected_valid).float().mean().item() + print( + f" shape: {tuple(topk_indices.shape)} " + f"valid_frac: {valid.float().mean().item():.4f} " + f"per-query valid count match: {frac_match:.4f} " + f"(num_seqs={num_seqs}, per_seq={per_seq})" + ) + + # Speed. + def fn(): + return hisa_indexer( + q, + k, + k_scale, + weights, + cu_ks, + cu_ke, + k_block_size=k_block_size, + block_topk=block_topk, + topk_tokens=topk_tokens, + ) + + ms = do_bench(fn, warmup=20, rep=50) + print( + f" latency: {ms:.3f} ms " + f"(M={M}, H={H}, D={D}, k_block_size={k_block_size}, " + f"block_topk={block_topk}, topk_tokens={topk_tokens}, num_seqs={num_seqs})" + ) + + +if __name__ == "__main__": + # Ref path in block_sparse_mqa materialises [M, topk, kvB, D] fp32 so + # stay modest on M (reuse the sparse_mqa module's sizing intuition). + for cfg in [ + dict(M=1024, H=64, D=128, k_block_size=128, block_topk=16, topk_tokens=256, num_seqs=1), + dict(M=1024, H=64, D=128, k_block_size=128, block_topk=16, topk_tokens=256, num_seqs=4), + dict(M=4096, H=64, D=128, k_block_size=128, block_topk=32, topk_tokens=1024, num_seqs=1), + dict(M=4096, H=64, D=128, k_block_size=128, block_topk=32, topk_tokens=1024, num_seqs=4), + dict(M=8192, H=64, D=128, k_block_size=128, block_topk=64, topk_tokens=2048, num_seqs=1), + dict(M=8192, H=64, D=128, k_block_size=128, block_topk=64, topk_tokens=2048, num_seqs=8), + ]: + test_hisa(**cfg) + torch.cuda.empty_cache() diff --git a/examples/dsa_hisa/pool_mqa_fp8.py b/examples/dsa_hisa/pool_mqa_fp8.py new file mode 100644 index 0000000000..515b311ac4 --- /dev/null +++ b/examples/dsa_hisa/pool_mqa_fp8.py @@ -0,0 +1,257 @@ +"""Stage-1 kernel: prefill pool-MQA over pooled (blocked) K. 
+ +Input: fp8 Q ``[M, H, D]`` + fp8 BlockedK ``[Nb, D]`` + per-block f32 scale +``[Nb]`` + f32 Weights ``[M, H]`` + per-query ``cu_seqlen_blocked_ks/ke [M]``. + +For each query ``m`` and pool block ``n`` in ``[cu_seqlen_blocked_ks[m], +cu_seqlen_blocked_ke[m])``: + ``logits[m, n] = sum_h ReLU(Q[m, h] . BlockedK[n]) * BlockedKScale[n] * Weights[m, h]`` + +Out-of-range entries in the raw kernel output are undefined — caller should +zero-init the buffer or apply a separate mask kernel. +""" + +import tilelang +from tilelang import language as T +from tilelang.profiler import do_bench +import torch + +from tilelang_utils import prepare_ks_ke_from_cu_seqlens +from clean_and_maintain_logits import ( + clean_and_maintain_logits_interface, + ref_clean_and_maintain_logits, +) + + +@tilelang.jit( + pass_configs={ + tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, + }, +) +def pool_mqa_attn_return_logits_fp8( + IndexQ, + IndexBlockedK, + IndexBlockedKScale, + Logits, + Weights, + CuSeqLenBlockedKS, + CuSeqLenBlockedKE, + heads: int = 64, + index_dim: int = 128, + block_N: int = 256, + num_stages: int = 3, + threads: int = 512, + block_Q: int = 0, +): + # block_Q is the tile size for queries; `0` means "derive from heads". + if block_Q == 0: + block_Q = 128 // heads + fp8_dtype = T.float8_e4m3fn + accum_dtype = T.float32 + index_dtype = T.int32 + + seq_len, seq_len_blocked_kv = T.const("seq_len, seq_len_blocked_kv") + + IndexQ: T.Tensor[[seq_len * heads, index_dim], fp8_dtype] + IndexBlockedK: T.Tensor[[seq_len_blocked_kv, index_dim], fp8_dtype] + IndexBlockedKScale: T.Tensor[[seq_len_blocked_kv], accum_dtype] + Logits: T.Tensor[[seq_len, seq_len_blocked_kv], accum_dtype] + Weights: T.Tensor[[seq_len, heads], accum_dtype] + CuSeqLenBlockedKS: T.Tensor[[seq_len], index_dtype] + CuSeqLenBlockedKE: T.Tensor[[seq_len], index_dtype] + + with T.Kernel(T.ceildiv(seq_len, block_Q), threads=threads) as bx: + index_q_shared = T.alloc_shared([block_Q * heads, index_dim], fp8_dtype) + index_k_shared = T.alloc_shared([block_N, index_dim], fp8_dtype) + index_k_scale_fragment = T.alloc_fragment([block_N], accum_dtype) + s = T.alloc_fragment([block_N, block_Q * heads], accum_dtype) + s_reshaped = T.reshape(s, (block_N, block_Q, heads)) + logits = T.alloc_fragment([block_N, block_Q], accum_dtype) + weights = T.alloc_fragment([block_Q, heads], accum_dtype) + + seq_len_i = bx * block_Q + + cu_k_s_min = T.alloc_var(index_dtype) + cu_k_e_max = T.alloc_var(index_dtype) + cu_k_s_min = 2147483647 + cu_k_e_max = -2147483648 + + for bq_i in T.serial(block_Q): + cu_k_s_min = T.min(cu_k_s_min, T.min(CuSeqLenBlockedKS[seq_len_i + bq_i], seq_len_blocked_kv)) + for bq_i in T.serial(block_Q): + cu_k_e_max = T.max(cu_k_e_max, T.min(CuSeqLenBlockedKE[seq_len_i + bq_i], seq_len_blocked_kv)) + + T.copy(IndexQ[seq_len_i * heads, 0], index_q_shared) + T.copy(Weights[seq_len_i, 0], weights) + + for nbn_i in T.Pipelined(T.ceildiv(cu_k_e_max - cu_k_s_min, block_N), num_stages=num_stages): + T.copy(IndexBlockedK[cu_k_s_min + nbn_i * block_N, 0], index_k_shared) + T.copy(IndexBlockedKScale[cu_k_s_min + nbn_i * block_N], index_k_scale_fragment) + + T.gemm( + index_k_shared, + index_q_shared, + s, + transpose_B=True, + clear_accum=True, + policy=T.GemmWarpPolicy.FullCol, + ) + + for bn_i, bq_i, h_i in T.Parallel(block_N, block_Q, heads): + s_reshaped[bn_i, bq_i, h_i] = T.max(s_reshaped[bn_i, bq_i, h_i] * index_k_scale_fragment[bn_i], 0) * weights[bq_i, h_i] + + T.reduce_sum(s_reshaped, logits, dim=-1, clear=True) + + for bq_i, bn_i in 
T.Parallel(block_Q, block_N): + Logits[seq_len_i + bq_i, cu_k_s_min + nbn_i * block_N + bn_i] = logits[bn_i, bq_i] + + +def pool_mqa_attn_return_logits_fp8_interface( + q_fp8: torch.Tensor, + blocked_kv_fp8: torch.Tensor, + blocked_kv_scale: torch.Tensor, + weights_f32: torch.Tensor, + cu_seqlen_blocked_ks: torch.Tensor, + cu_seqlen_blocked_ke: torch.Tensor, + block_N: int = 256, +): + """Raw kernel invocation; zero-inits logits so positions the kernel + doesn't touch are 0 (matches the ref).""" + seq_len, heads, index_dim = q_fp8.shape + seq_len_blocked_kv = blocked_kv_fp8.shape[0] + + logits = torch.zeros([seq_len, seq_len_blocked_kv], device=q_fp8.device, dtype=torch.float32) + pool_mqa_attn_return_logits_fp8( + q_fp8.view(seq_len * heads, index_dim), + blocked_kv_fp8, + blocked_kv_scale, + logits, + weights_f32, + cu_seqlen_blocked_ks, + cu_seqlen_blocked_ke, + heads=heads, + index_dim=index_dim, + block_N=block_N, + ) + return logits + + +def ref_pool_mqa_fp8( + q_fp8: torch.Tensor, + blocked_kv_fp8: torch.Tensor, + blocked_kv_scale: torch.Tensor, + weights_f32: torch.Tensor, +) -> torch.Tensor: + """Spec: for each (m, n), logits[m, n] = sum_h ReLU(q[m,h] . k[n] * k_scale[n]) * w[m,h]. + Computes the full dense [M, Nb] grid — caller is responsible for any masking.""" + q_f = q_fp8.float() + k_f = blocked_kv_fp8.float() * blocked_kv_scale[:, None] + # score[m, n, h] = q[m, h] . k[n] + s = torch.einsum("mhd,nd->mnh", q_f, k_f) # [M, Nb, H] + logits = (s.clamp(min=0) * weights_f32[:, None, :]).sum(dim=-1) # [M, Nb] + return logits + + +def test_pool_mqa_fp8( + M: int = 32768, + H: int = 64, + D: int = 128, + k_block_size: int = 128, + block_N: int = 256, + num_seqs: int = 1, +): + """Correctness + speed test packing `num_seqs` equal-length causal + sequences into the [M, H, D] Q tensor. + + Per-query ``cu_seqlen_blocked_ks/ke`` is derived from the raw-token + packed ``cu_ks / cu_ke`` produced by ``prepare_ks_ke_from_cu_seqlens`` + (floor-divide / ceil-divide by ``k_block_size`` respectively). + + The kernel writes the per-tile ``[cu_k_s_min, cu_k_e_max)`` union of + visible K ranges — entries inside this union but outside an + individual query's visible range carry raw (unmasked) dot-product + values. To make correctness well-defined, we apply the + ``clean_and_maintain_logits`` mask (-inf for out-of-range, +inf for + the first/last valid block) to both the kernel output and the torch + reference before comparing — this mirrors what the hisa pipeline + does right after this kernel. + """ + torch.manual_seed(0) + assert M % num_seqs == 0, f"M ({M}) must be divisible by num_seqs ({num_seqs})" + per_seq = M // num_seqs + N_blocked = (M + k_block_size - 1) // k_block_size + assert N_blocked % block_N == 0, ( + f"N_blocked ({N_blocked}) must be a multiple of block_N ({block_N}). Pick M such that ceildiv(M, k_block_size) % block_N == 0." + ) + + # Per-token packed ks/ke (causal within each sequence), then translate + # to pool-block coords. 
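+    # (Floor for the start block, ceil for the end block, matching the
+    # translation hisa_indexer applies before calling this kernel.)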
+ cu_seqlens = torch.arange(num_seqs + 1, device="cuda", dtype=torch.long) * per_seq + ks_long, ke_long = prepare_ks_ke_from_cu_seqlens(cu_seqlens) + cu_ks_token = ks_long.to(torch.int32).contiguous() + cu_ke_token = ke_long.to(torch.int32).contiguous() + cu_blocked_ks = (cu_ks_token // k_block_size).contiguous() + cu_blocked_ke = ((cu_ke_token + k_block_size - 1) // k_block_size).contiguous() + + q_bf16 = torch.randn(M, H, D, device="cuda", dtype=torch.bfloat16) + q = q_bf16.to(torch.float8_e4m3fn) + blocked_k_bf16 = torch.randn(N_blocked, D, device="cuda", dtype=torch.bfloat16) + blocked_k = blocked_k_bf16.to(torch.float8_e4m3fn) + blocked_k_scale = (0.1 + 0.01 * torch.rand(N_blocked, device="cuda", dtype=torch.float32)).contiguous() + weights = torch.randn(M, H, device="cuda", dtype=torch.float32) + + # Correctness — kernel + post-mask. + got = pool_mqa_attn_return_logits_fp8_interface( + q, + blocked_k, + blocked_k_scale, + weights, + cu_blocked_ks, + cu_blocked_ke, + block_N=block_N, + ) + clean_and_maintain_logits_interface(got, cu_blocked_ks, cu_blocked_ke) + + ref = ref_pool_mqa_fp8(q, blocked_k, blocked_k_scale, weights) + ref = ref_clean_and_maintain_logits(ref, cu_blocked_ks, cu_blocked_ke) + + # After the mask, +/-inf positions must agree exactly. Compare the + # remaining finite values under an fp8×fp8 GEMM tolerance. + assert torch.equal(torch.isposinf(got), torch.isposinf(ref)), "pos-inf mask differs" + assert torch.equal(torch.isneginf(got), torch.isneginf(ref)), "neg-inf mask differs" + finite = torch.isfinite(got) & torch.isfinite(ref) + torch.testing.assert_close(got[finite], ref[finite], rtol=5e-2, atol=5e-2) + print(f" correctness: PASS (M={M}, H={H}, D={D}, N_blocked={N_blocked}, block_N={block_N}, num_seqs={num_seqs}, per_seq={per_seq})") + + # Speed (kernel only — excludes the post mask). + def fn(): + return pool_mqa_attn_return_logits_fp8_interface( + q, + blocked_k, + blocked_k_scale, + weights, + cu_blocked_ks, + cu_blocked_ke, + block_N=block_N, + ) + + ms = do_bench(fn, warmup=50, rep=200) + # FLOPs: fp8×fp8 GEMM dominates = 2 * M * H * Nb * D (mul+add). + total_flops = 2 * M * H * N_blocked * D + tflops = total_flops / (ms * 1e-3) / 1e12 + print(f" latency: {ms:.4f} ms ({tflops:.2f} fp8 TFLOPS)") + + +if __name__ == "__main__": + # M × k_block_size^-1 must be a multiple of block_N=256. + # With k_block_size=128 → N_blocked = M/128; need N_blocked % 256 == 0 + # → M % 32768 == 0. + # (M, H, D, k_block_size, block_N, num_seqs) + for cfg in [ + (32768, 64, 128, 128, 256, 1), + (32768, 64, 128, 128, 256, 4), + (65536, 64, 128, 128, 256, 1), + (65536, 64, 128, 128, 256, 8), + (131072, 64, 128, 128, 256, 16), + ]: + test_pool_mqa_fp8(*cfg) diff --git a/examples/dsa_hisa/tilelang_utils.py b/examples/dsa_hisa/tilelang_utils.py new file mode 100644 index 0000000000..80a1441c5b --- /dev/null +++ b/examples/dsa_hisa/tilelang_utils.py @@ -0,0 +1,314 @@ +import torch +import torch.nn.functional as F +import functools +from typing import Callable, Any, Tuple + + +def tensor_cache( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator that caches the most recent result of a function with tensor inputs. + + This decorator will store the output of the decorated function for the most recent set of input tensors. + If the function is called again with the same input tensors, it will return the cached result. + + + Args: + fn (Callable[..., torch.Tensor]): + The function to be decorated. It should take tensor inputs and return tensor outputs. 
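+
+    Note:
+        Cache hits are detected by object identity (``is``), not by value;
+        calling with an equal but freshly allocated tensor recomputes.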
+ + Returns: + Callable[..., torch.Tensor]: + A wrapped version of the input function with single-entry caching. + """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + (last_args is not None and last_kwargs is not None) + and (len(args) == len(last_args) and len(kwargs) == len(last_kwargs)) + and all(a is b for a, b in zip(args, last_args, strict=False)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_lens( + lens: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + return F.pad(lens.cumsum(dim=0, dtype=dtype), (1, 0)) + + +@tensor_cache +def prepare_lens_from_cu_seqlens( + cu_seqlens: torch.LongTensor, +) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_position_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.cat([torch.arange(n, dtype=cu_seqlens.dtype, device=cu_seqlens.device) for n in prepare_lens(cu_seqlens).unbind()]) + + +@tensor_cache +def prepare_sequence_ids(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return prepare_position_ids(cu_seqlens).eq(0).cumsum(0) - 1 + + +@tensor_cache +def prepare_token_indices(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + position_ids = prepare_position_ids(cu_seqlens) + return torch.stack([prepare_sequence_ids(cu_seqlens), position_ids], 1).to(cu_seqlens) + + +@tensor_cache +def prepare_cu_seqlens_from_position_ids( + position_ids: torch.LongTensor, + dtype: torch.dtype | None = torch.int32, +) -> torch.LongTensor: + starts = (position_ids == 0).nonzero(as_tuple=True)[0] + total_len = position_ids.new_tensor([position_ids.numel()]) + boundaries = torch.cat([starts, total_len]) + lens = torch.diff(boundaries) + cu_seqlens = prepare_cu_seqlens_from_lens(lens, dtype=dtype) + return cu_seqlens + + +@tensor_cache +def prepare_ks_ke_from_cu_seqlens( + cu_seqlens: torch.LongTensor, +) -> tuple[torch.LongTensor, torch.LongTensor]: + position_ids = prepare_position_ids(cu_seqlens) + sequence_ids = position_ids.eq(0).cumsum(0) - 1 + + ks = cu_seqlens[sequence_ids] + ke = ks + position_ids + 1 + + return ks, ke + + +@tensor_cache +def prepare_ks_ke_from_cu_seqlens_qk( + cu_seqlens_q: torch.LongTensor, + cu_seqlens_k: torch.LongTensor, +) -> tuple[torch.LongTensor, torch.LongTensor]: + position_ids_q = prepare_position_ids(cu_seqlens_q) + sequence_ids_q = position_ids_q.eq(0).cumsum(0) - 1 + + seqlens_q = prepare_lens(cu_seqlens_q) + seqlens_k = prepare_lens(cu_seqlens_k) + offset = seqlens_k - seqlens_q + + ks = cu_seqlens_k[sequence_ids_q] + ke = ks + position_ids_q + offset[sequence_ids_q] + 1 + + return ks, ke + + +def ceil_to_ue8m0(x: torch.Tensor): + assert x.view(-1).amax().item() > 0 + return torch.pow(2.0, torch.ceil(torch.log2(x.abs()))) + + +def per_custom_dims_cast_to_fp8(x: torch.Tensor, dims: Tuple[int], use_ue8m0: bool) -> Tuple[torch.Tensor, torch.Tensor]: + excluded_dims = tuple([i for i in range(x.dim()) if i not in set(dims)]) + x_amax = x.abs().float().amax(dim=excluded_dims, keepdim=True).clamp(1e-4) + sf = x_amax 
/ 448.0
+    sf = ceil_to_ue8m0(sf) if use_ue8m0 else sf
+    x_scaled = (x * (1.0 / sf)).to(torch.float8_e4m3fn)
+    return x_scaled, sf.squeeze()
+
+
+def get_abs_err(y, x):
+    x = x.to(torch.float32)
+    y = y.to(torch.float32)
+    return (x - y).flatten().abs().max().item()
+
+
+def get_err_ratio(y, x):
+    x = x.to(torch.float32)
+    y = y.to(torch.float32)
+    err = (x - y).flatten().square().mean().sqrt().item()
+    base = x.flatten().square().mean().sqrt().item()
+    return err / base
+
+
+def calculate_tensor_similarity(x, y, name="tensor"):
+    """
+    Calculate similarity between two tensors using a normalized dot product metric.
+
+    Unlike torch.testing.assert_close which uses absolute/relative tolerance based on
+    element-wise differences, this function computes a global similarity score:
+        sim = 2 * <x, y> / (||x||^2 + ||y||^2)
+
+    This metric is symmetric in x and y and measures a cosine-like similarity that also
+    penalizes magnitude differences between the two tensors. It returns 1 only for
+    identical tensors and values closer to 0 for dissimilar ones. This is particularly
+    useful for comparing tensors with varying magnitudes where relative errors matter
+    more than absolute differences.
+
+    Args:
+        x: First tensor to compare
+        y: Second tensor to compare
+        name: Name of the tensor for logging purposes
+
+    Returns:
+        Similarity score in range [-1, 1], where 1 means identical
+    """
+    x, y = x.data.double(), y.data.double()
+    denominator = (x * x + y * y).sum()
+    if denominator == 0:
+        print(f"\033[33mWARNING: {name} all zero\033[0m")
+        return 1
+    sim = 2 * (x * y).sum() / denominator
+    return sim
+
+
+def assert_tensors_similar(x, y, eps=1e-8, name="tensor", raise_assert=True):
+    """
+    Assert that two tensors are similar using a global similarity metric.
+
+    Key differences from torch.testing.assert_close:
+    - torch.testing.assert_close: Uses element-wise comparison with rtol/atol, checking
+      that |x - y| <= atol + rtol * |y| for each element. It's sensitive to outliers
+      and requires all elements to satisfy the tolerance.
+    - assert_tensors_similar: Uses a single global similarity score (1 - sim) where sim is the
+      normalized dot product. It's more robust to outliers and focuses on overall
+      tensor similarity rather than element-wise precision. This is better suited for
+      comparing large tensors where a few outlier elements shouldn't fail the test.
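+
+    For example (illustrative values, not from the original tests): y == x gives
+    sim = 1 and diff = 0, while y == 2 * x gives sim = 2 * 2||x||^2 / (5||x||^2) = 0.8,
+    i.e. diff = 0.2, far above the default eps of 1e-8.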
+
+    Args:
+        x: First tensor to compare
+        y: Second tensor to compare
+        eps: Maximum allowed difference (1 - similarity), default 1e-8
+        name: Name of the tensor for error messages
+        raise_assert: Whether to raise assertion error on failure
+    """
+    sim = calculate_tensor_similarity(x, y, name)
+    diff = 1.0 - sim
+    if not (0 <= diff <= eps):
+        print(f"\033[31mERROR: {name} similarity check failed, diff={diff:.2e} (threshold={eps:.2e})\033[0m")
+        if raise_assert:
+            assert False  # noqa: B011
+
+
+@tensor_cache
+def cal_seq_idx_for_q(cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, seq_len: int) -> torch.IntTensor:
+    seq_idx_for_q = torch.full((seq_len,), len(cu_seqlens_qs), dtype=torch.int32, device=cu_seqlens_qs.device)
+    for i in range(len(cu_seqlens_qs)):
+        seq_idx_for_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = i
+    return seq_idx_for_q
+
+
+@tensor_cache
+def cal_cu_seqlen_ks_for_q(
+    cu_seqlens_qs: torch.LongTensor, cu_seqlens_qe: torch.LongTensor, cu_seqlens_ks: torch.LongTensor, seq_len: int
+) -> torch.IntTensor:
+    cu_seqlen_ks_for_each_q = torch.gather(
+        input=torch.cat([cu_seqlens_ks, torch.full((1,), torch.iinfo(torch.int32).max, dtype=torch.int32, device=cu_seqlens_qs.device)]),
+        dim=0,
+        index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(),
+    )
+    return cu_seqlen_ks_for_each_q.int()
+
+
+@tensor_cache
+def cal_cu_seqlen_ke_for_q(
+    cu_seqlens_qs: torch.LongTensor,
+    cu_seqlens_qe: torch.LongTensor,
+    cu_seqlens_ks: torch.LongTensor,
+    cu_seqlens_ke: torch.LongTensor,
+    q_start_idxs: torch.LongTensor,
+    seq_len: int,
+    kv_stride: int,
+) -> torch.IntTensor:
+    cu_seqlen_ke_for_each_q = torch.gather(
+        input=torch.cat([cu_seqlens_ke, torch.zeros(1, dtype=torch.int32, device=cu_seqlens_qs.device)]),
+        dim=0,
+        index=cal_seq_idx_for_q(cu_seqlens_qs=cu_seqlens_qs, cu_seqlens_qe=cu_seqlens_qe, seq_len=seq_len).long(),
+    )
+    causal_cu_seqlen_ke_for_each_q = torch.zeros((seq_len,), dtype=torch.int32, device=cu_seqlens_qs.device)
+    for i in range(len(cu_seqlens_qs)):
+        causal_cu_seqlen_ke_for_each_q[cu_seqlens_qs[i] : cu_seqlens_qe[i]] = (
+            torch.arange(
+                q_start_idxs[i], q_start_idxs[i] + cu_seqlens_qe[i] - cu_seqlens_qs[i], dtype=torch.int32, device=cu_seqlens_qs.device
+            )
+            + 1
+        ) // kv_stride + cu_seqlens_ks[i]
+    cu_seqlen_ke_for_each_q = torch.minimum(causal_cu_seqlen_ke_for_each_q, cu_seqlen_ke_for_each_q)
+    return cu_seqlen_ke_for_each_q.int()
+
+
+def generate_random_cu_seqlens(per_cp_seqlen, cp_size=4, cp_rank=3, kv_stride=1, average_q_len=512):
+    total_seqlen = per_cp_seqlen * cp_size
+
+    seqlens = torch.randint(0, average_q_len * 2, (total_seqlen // average_q_len * 2,)).cuda()
+    last_seq_id = torch.where(seqlens.cumsum(0) >= total_seqlen)[0][0]
+    seqlens = seqlens[:last_seq_id]
+
+    if seqlens.sum() < total_seqlen:
+        seqlens = torch.cat([seqlens, torch.tensor([total_seqlen - seqlens.sum()]).cuda()])
+
+    cu_seqlens_cumsum = torch.cumsum(seqlens, dim=0)
+    cu_seqlens_k_cumsum = torch.cumsum(seqlens // kv_stride, dim=0)
+    cu_seqlens_qs = torch.cat([torch.tensor([0]).cuda(), cu_seqlens_cumsum[:-1]])
+    cu_seqlens_ks = torch.cat([torch.tensor([0]).cuda(), cu_seqlens_k_cumsum[:-1]])
+    cu_seqlens_qe = cu_seqlens_cumsum.clone()
+    cu_seqlens_ke = cu_seqlens_k_cumsum.clone()
+
+    cu_seqlens_ks_for_each_q = cal_cu_seqlen_ks_for_q(
+        cu_seqlens_qs=cu_seqlens_qs,
+        cu_seqlens_qe=cu_seqlens_qe,
+        cu_seqlens_ks=cu_seqlens_ks,
+        seq_len=total_seqlen,
+    )
+    cu_seqlens_ke_for_each_q = 
cal_cu_seqlen_ke_for_q( + cu_seqlens_qs=cu_seqlens_qs, + cu_seqlens_qe=cu_seqlens_qe, + cu_seqlens_ks=cu_seqlens_ks, + cu_seqlens_ke=cu_seqlens_ke, + q_start_idxs=torch.zeros_like(cu_seqlens_qs), + seq_len=total_seqlen, + kv_stride=kv_stride, + ) + + assert per_cp_seqlen % 2 == 0 + per_chunk_seqlen = per_cp_seqlen // 2 + slice_short = slice(cp_rank * per_chunk_seqlen, (cp_rank + 1) * per_chunk_seqlen) + slice_long = slice( + total_seqlen - (cp_rank + 1) * per_chunk_seqlen, + total_seqlen - cp_rank * per_chunk_seqlen, + ) + ks = torch.cat( + [ + cu_seqlens_ks_for_each_q[slice_short], + cu_seqlens_ks_for_each_q[slice_long], + ] + ) + ke = torch.cat( + [ + cu_seqlens_ke_for_each_q[slice_short], + cu_seqlens_ke_for_each_q[slice_long], + ] + ) + assert len(ks) == len(ke) == per_cp_seqlen + return ks, ke diff --git a/examples/dsa_sparse_finetune/indexer_bwd.py b/examples/dsa_sparse_finetune/indexer_bwd.py index 68508ad4e4..54e02e4f18 100644 --- a/examples/dsa_sparse_finetune/indexer_bwd.py +++ b/examples/dsa_sparse_finetune/indexer_bwd.py @@ -13,10 +13,7 @@ FP32 = T.float32 INT32 = T.int32 -pass_configs = { - tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, -} +pass_configs = {tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @tl.jit(pass_configs=pass_configs) diff --git a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py index d76eb02724..1066199cd0 100644 --- a/examples/dsa_sparse_finetune/indexer_topk_reducesum.py +++ b/examples/dsa_sparse_finetune/indexer_topk_reducesum.py @@ -14,11 +14,7 @@ FP32 = T.float32 INT32 = T.int32 -pass_configs = { - tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True, - tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, -} +pass_configs = {tl.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True, tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @tl.jit(pass_configs=pass_configs) diff --git a/examples/dsa_sparse_finetune/sparse_mla_bwd.py b/examples/dsa_sparse_finetune/sparse_mla_bwd.py index 53e5f8bfea..ab0b4fc493 100644 --- a/examples/dsa_sparse_finetune/sparse_mla_bwd.py +++ b/examples/dsa_sparse_finetune/sparse_mla_bwd.py @@ -78,10 +78,7 @@ def postprocess_kernel( @tilelang.jit( out_idx=[-2], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def bwd( H, @@ -226,17 +223,17 @@ def sparse_mla_bwd_kernel( if bi_i < BS // split_store: acc_dkv_tail_shared[bi_i, d_i] = acc_dkv_tail[bi_i + s * (BS // split_store), d_i] - for bi_i, d_i in T.Parallel(BS // split_store, D // 4): - T.atomic_addx4( - dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i * 4], - acc_dkv_shared[bi_i, d_i * 4], + for bi_i, d_i in T.Parallel(BS // split_store, D): + T.atomic_add( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, d_i], + acc_dkv_shared[bi_i, d_i], ) # Atomically update dKV, dKV_tail tensors - for bi_i, d_i in T.Parallel(BS // split_store, D_tail // 4): - T.atomic_addx4( - dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i * 4], - acc_dkv_tail_shared[bi_i, d_i * 4], + for bi_i, d_i in T.Parallel(BS // split_store, D_tail): + T.atomic_add( + dKV[bos + Indices[bos + s_i, bz, i_i * BS + bi_i + s * (BS // split_store)], bz, D + d_i], + acc_dkv_tail_shared[bi_i, 
d_i], ) # Store the accumulated dQ diff --git a/examples/dsa_sparse_finetune/sparse_mla_fwd.py b/examples/dsa_sparse_finetune/sparse_mla_fwd.py index d875236952..fcde71928b 100644 --- a/examples/dsa_sparse_finetune/sparse_mla_fwd.py +++ b/examples/dsa_sparse_finetune/sparse_mla_fwd.py @@ -9,10 +9,7 @@ @tilelang.jit( out_idx=[-2, -1], - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def sparse_mla_fwd( heads, diff --git a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py index a03bc74f51..2fff8dd20f 100644 --- a/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py +++ b/examples/dsa_sparse_finetune/sparse_mla_topk_reducesum.py @@ -12,10 +12,7 @@ FP32 = T.float32 INT32 = T.int32 -pass_configs = { - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, -} +pass_configs = {tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @tilelang.jit(pass_configs=pass_configs) diff --git a/examples/eager_jit/eagerjit.en.ipynb b/examples/eager_jit/eagerjit.en.ipynb new file mode 100644 index 0000000000..6a2bf8453b --- /dev/null +++ b/examples/eager_jit/eagerjit.en.ipynb @@ -0,0 +1,977 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "5e0deecc", + "metadata": {}, + "outputs": [], + "source": [ + "import sys\n", + "from pathlib import Path\n", + "\n", + "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n", + "import tilelang\n", + "import torch\n", + "import tilelang.language as T" + ] + }, + { + "cell_type": "markdown", + "id": "1ca2c56d", + "metadata": {}, + "source": [ + "# Tilelang Eager JIT" + ] + }, + { + "cell_type": "markdown", + "id": "156e7370", + "metadata": {}, + "source": [ + "## Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "b070c109", + "metadata": {}, + "source": [ + "Tilelang Eager JIT merges JIT kernel generation and invocation into a single workflow.\n", + "\n", + "The function signature looks similar to Triton, but we add many enhancements; the most important one is allowing rich Tensor annotations:\n", + "\n", + "* If a Tensor has complex shape constraints, we can move its annotation into the function body.\n", + "* Use `T.const` or `T.dynamic` to create shape variables, then annotate complex Tensors with `T.Tensor`.\n", + "* Use `T.empty` to declare return tensors." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "60bf8954", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm(\n", + " A,\n", + " B,\n", + " out_dtype: T.dtype = T.float32,\n", + " block_M: int = 128,\n", + " block_N: int = 128,\n", + " block_K: int = 32,\n", + "):\n", + " M, N, K = T.const(\"M, N, K\")\n", + "\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + "\n", + " C = T.empty((M, N), out_dtype)\n", + "\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])\n", + " return C" + ] + }, + { + "cell_type": "markdown", + "id": "28f868fe", + "metadata": {}, + "source": [ + "Calling the function with Tensors directly triggers the full JIT compile-and-run pipeline:" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "id": "ee13394a", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B)\n", + "\n", + "# check output is correct\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "c6705091", + "metadata": {}, + "source": [ + "Changing the call arguments may trigger a recompilation when compilation parameters change:" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "d8aab5b7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm(A, B, block_M=64, block_N=64)" + ] + }, + { + "cell_type": "markdown", + "id": "ce6b7391", + "metadata": {}, + "source": [ + "You can also explicitly call the `compile` method to build the kernel.\n", + "\n", + "1. `ker.compile` compiles the kernel\n", + "2. `ker.get_tir` retrieves the TIR\n", + "3. `ker.par_compile` compiles in parallel" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "f3cf3a2d", + "metadata": {}, + "outputs": [], + "source": [ + "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n", + "C = kernel(A, B)" + ] + }, + { + "cell_type": "markdown", + "id": "921761b5", + "metadata": {}, + "source": [ + "## More Tensor Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "4539e54e", + "metadata": {}, + "source": [ + "### Use macros to separate implementation" + ] + }, + { + "cell_type": "markdown", + "id": "ad96ba65", + "metadata": {}, + "source": [ + "Next, we implement a simple GEMM in several different ways. 
For convenience, we first write a macro that contains the core GEMM logic:" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "171d4fe6", + "metadata": {}, + "outputs": [], + "source": [ + "@T.macro\n", + "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n", + " B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n", + " C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n", + " T.clear(C_local)\n", + " for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n", + " T.copy(A[bx * block_M, k * block_K], A_shared)\n", + " T.copy(B[k * block_K, by * block_N], B_shared)\n", + " T.gemm(A_shared, B_shared, C_local)\n", + " T.copy(C_local, C[bx * block_M, by * block_N])" + ] + }, + { + "cell_type": "markdown", + "id": "446a1acd", + "metadata": {}, + "source": [ + "### Use `T.dynamic` to mark dynamic shapes\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "6a38aa95", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_dyn_K(A, B):\n", + " M, N, K = T.dynamic(\"M, N, K\")\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "fe6cfdc8", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_dyn_K(A, B)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "2ee97bf7", + "metadata": {}, + "source": [ + "### Use `T.StridedTensor` to annotate tensors with strides\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "9dde1dae", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def as_contingious(A):\n", + " M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n", + " A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n", + " B = T.empty((M, N), A.dtype)\n", + " block_M = 128\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n", + " T.copy(\n", + " A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n", + " )\n", + " return B" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "dec2c0a7", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 1024, device=\"cuda\")\n", + "B = as_contingious(A.T)\n", + "B_ref = A.T.contiguous()\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "f5fb20d6", + "metadata": {}, + "source": [ + "## More Annotation" + ] + }, + { + "cell_type": "markdown", + "id": "890df0a2", + "metadata": {}, + "source": [ + "### Use parameters directly as annotations" + ] + }, + { + "cell_type": "markdown", + "id": "e9a47d42", + "metadata": {}, + "source": [ + "You can directly use function parameters in the annotations." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "0fc17af6", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr(\n", + " A,\n", + " B,\n", + " M,\n", + " N,\n", + " K,\n", + "):\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8e52a554", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "6b19ef90", + "metadata": {}, + "source": [ + "### Annotations for runtime variables" + ] + }, + { + "cell_type": "markdown", + "id": "bba5f27f", + "metadata": {}, + "source": [ + "Runtime variables work the same; if the function annotation becomes too long, you can move it into the function body." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "id": "c1e7598a", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def gemm_ptr_dyn(A, B, M, N, K):\n", + " M: T.int32\n", + " N: T.int32\n", + " K: T.int32\n", + " A: T.Tensor[[M, K], T.float16]\n", + " B: T.Tensor[[K, N], T.float16]\n", + " C = T.empty((M, N), T.float32)\n", + " gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n", + " return C" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "id": "9e9a4c88", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n", + "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n", + "C_ref = (A @ B).float()\n", + "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)" + ] + }, + { + "cell_type": "markdown", + "id": "81427765", + "metadata": {}, + "source": [ + "### Constraints for constants" + ] + }, + { + "cell_type": "markdown", + "id": "4d6b084b", + "metadata": {}, + "source": [ + "A constant annotation created by `T.const` must be used directly at least once, otherwise an error is raised." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "id": "c90dd24f", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Constexpr variable `M` is not used in any buffer shape or stride.\n", + "At least one **DIRECT** usage is required. Please check:\n", + "(1) the variable is not used\n", + "(2) all uses are indirect, e.g. M * 2, M * 3. 
(you can replace them with separate constexpr variables)\n", + "Buffer shapes: {A: [M * 2, M * 3]}\n", + "Buffer strides: {A: [M * 3, 1]}\n" + ] + } + ], + "source": [ + "@tilelang.jit\n", + "def example_wrong_kernel(A):\n", + " M = T.const(\"M\")\n", + " A: T.Tensor[[M * 2, M * 3], T.float32]\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + "\n", + "\n", + "try:\n", + " A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + " example_wrong_kernel(A)\n", + "except Exception as e:\n", + " print(e)" + ] + }, + { + "cell_type": "markdown", + "id": "e07e762b", + "metadata": {}, + "source": [ + "### Dynamic dimensions" + ] + }, + { + "cell_type": "markdown", + "id": "f48e5d7a", + "metadata": {}, + "source": [ + "If you want certain parameters in a Tensor annotation to change, it is recommended to switch to the `T.ptr` + `T.match_buffer` style." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "id": "1d050321", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "[]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@tilelang.jit\n", + "def dyn_annot(\n", + " A: T.ptr, # 1. T.ptr type annotation\n", + " is_2d=False,\n", + "):\n", + " if is_2d:\n", + " M, N = T.const(\"M, N\")\n", + " # 2. dynamic shape annotation inside function body\n", + " A = T.match_buffer(A, [M, N], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0, 0]\n", + " else:\n", + " L = T.const(\"L\")\n", + " A = T.match_buffer(A, [L], T.float32)\n", + " with T.Kernel(1) as _:\n", + " A[0]\n", + "\n", + "\n", + "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n", + "dyn_annot(A, is_2d=True)" + ] + }, + { + "cell_type": "markdown", + "id": "2e9f1bb3", + "metadata": {}, + "source": [ + "### Default arguments" + ] + }, + { + "cell_type": "markdown", + "id": "f7fc9917", + "metadata": {}, + "source": [ + "Scalar annotations like `T.float32` can carry default values." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "id": "42ec86a1", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def add_one(X, data: T.float32 = 1):\n", + " M, N = T.const(\"M, N\")\n", + " X: T.Tensor[[M, N], T.float32]\n", + " Y = T.empty((M, N), T.float32)\n", + " with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n", + " for i, j in T.Parallel(128, N):\n", + " Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n", + " return Y" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "id": "d49e1120", + "metadata": {}, + "outputs": [], + "source": [ + "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n", + "Y = add_one(X)\n", + "torch.testing.assert_close(Y, X + 1)" + ] + }, + { + "cell_type": "markdown", + "id": "a02baedc", + "metadata": {}, + "source": [ + "## Overhead of argument matching" + ] + }, + { + "cell_type": "markdown", + "id": "860a2972", + "metadata": {}, + "source": [ + "EagerJIT has very small overhead; each additional constant annotation costs about 200 ns.\n", + "* 200 ns is roughly the cost of an FFI call that reads parameters from a `torch.Tensor`'s shape/stride." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dc676e33", + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Kernel call : 7.68 us\n", + "Parse cache key: 0.41 us\n" + ] + } + ], + "source": [ + "import time\n", + "\n", + "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n", + "\n", + "\n", + "@tilelang.jit\n", + "def dummy_kernel(A, B):\n", + " M, N = T.const(\"M, N\")\n", + " A: T.Tensor[[M, N], T.float16]\n", + " B: T.Tensor[[M, N], T.float16]\n", + " with T.Kernel(1) as _:\n", + " pass\n", + "\n", + "\n", + "# compile it first\n", + "dummy_kernel(A, B)\n", + "\n", + "\n", + "def eval_overhead(f):\n", + " start = time.perf_counter_ns()\n", + " for _ in range(10000):\n", + " f()\n", + " stop = time.perf_counter_ns()\n", + " return (stop - start) / 10000 / 1000\n", + "\n", + "\n", + "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n", + "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n", + "\n", + "print(f\"Kernel call : {kernel_call_overhead:.2f} us\")\n", + "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")" + ] + }, + { + "cell_type": "markdown", + "id": "39166cb4", + "metadata": {}, + "source": [ + "## Compilation and parallel compilation" + ] + }, + { + "cell_type": "markdown", + "id": "8c6fbe08", + "metadata": {}, + "source": [ + "Both EagerJIT and the original `jit` (i.e. LazyJIT) support parallel compilation.\n", + "\n", + "To avoid wasting memory on temporary `torch.Tensor` objects, you can use `T.Tensor` to create placeholders." + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "id": "7222e57b", + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Elaborating: 0%| | 0/8 [00:00,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ,\n", + " ]" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from itertools import product\n", + "\n", + "\n", + "def get_configs():\n", + " return [\n", + " {\n", + " \"A\": T.Tensor((1024, 1024), T.float32),\n", + " \"B\": T.Tensor((1024, 1024), T.float32),\n", + " \"block_M\": block_M,\n", + " \"block_N\": block_N,\n", + " \"block_K\": block_K,\n", + " }\n", + " for block_M, block_N, block_K in product([32, 64], repeat=3)\n", + " ]\n", + "\n", + "\n", + "gemm.par_compile(get_configs())" + ] + }, + { + "cell_type": "markdown", + "id": "5160d2cc", + "metadata": {}, + "source": [ + "## More convenient macros" + ] + }, + { + "cell_type": "markdown", + "id": "be44afc4", + "metadata": {}, + "source": [ + "tilelang's macros have been improved:\n", + "\n", + "1. Allow using `T.Ref` as an annotation, similar to C++ references.\n", + "2. Allow returning multiple values.\n", + "3. Allow nesting and recursion." + ] + }, + { + "cell_type": "markdown", + "id": "79575972", + "metadata": {}, + "source": [ + "### Passing references with `T.Ref`\n", + "\n", + "A `T.Ref` reference can point to a scalar variable or to an element of a buffer." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 21, + "id": "90eaa6e5", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo(x_handle: T.handle):\n", + " x = T.match_buffer(x_handle, (2,), strides=(1,))\n", + " # with T.block(\"root\"):\n", + " bx = T.launch_thread(\"blockIdx.x\", 1)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n", + " T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n", + " T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n", + " idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n", + " x[1] = T.float32(1.0)\n", + " _tmp: T.int32 = idx[0]\n", + " x[_tmp] = T.float32(1.0)" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def macro_with_ref(x: T.Ref):\n", + " x = 1 # noqa: F841\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo(x: T.Tensor((2,))):\n", + " with T.Kernel(1) as _:\n", + " # Supports constant indices\n", + " macro_with_ref(x[1])\n", + "\n", + " # Also supports variable indices\n", + " idx = T.alloc_var(T.int32, 0)\n", + " macro_with_ref(x[idx])\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "markdown", + "id": "7bb447a2", + "metadata": {}, + "source": [ + "### Pass macros as arguments\n", + "\n", + "You can pass a macro as a function argument." + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "id": "dc7bb779", + "metadata": {}, + "outputs": [], + "source": [ + "@tilelang.jit\n", + "def element_wise(A, fn):\n", + " N = T.dynamic(\"N\")\n", + " A: T.Tensor[[N], T.float32]\n", + " B = T.empty((N,), dtype=A.dtype)\n", + " block_N = 128\n", + " with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n", + " for i in T.Parallel(block_N):\n", + " idx = bx * block_N + i\n", + " B[idx] = fn(A[idx])\n", + " return B\n", + "\n", + "\n", + "@T.macro\n", + "def add_one(x):\n", + " return x + 1" + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "id": "a89fdb44", + "metadata": {}, + "outputs": [], + "source": [ + "A = torch.randn(1024, device=\"cuda\")\n", + "B = element_wise(A, add_one)\n", + "B_ref = A + 1\n", + "torch.testing.assert_close(B, B_ref)" + ] + }, + { + "cell_type": "markdown", + "id": "ef6e403a", + "metadata": {}, + "source": [ + "### Recursive macros\n", + "\n", + "You may not need this often, but macros can be recursive as long as the termination condition is known at compile time." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "7703cab5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@T.macro\n",
+    "def n31(x, var: T.Ref):\n",
+    "    if x == 1:\n",
+    "        pass\n",
+    "    elif x % 2 == 0:\n",
+    "        var = var // 2\n",
+    "        n31(x // 2, var)\n",
+    "    else:\n",
+    "        var = var * 3 + 1\n",
+    "        n31(x * 3 + 1, var)\n",
+    "\n",
+    "\n",
+    "@tilelang.jit\n",
+    "def foo(A: T.Tensor[[1], T.int32], n: int):\n",
+    "    with T.Kernel(1) as _:\n",
+    "        n31(n, A[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 25,
+   "id": "542ddd4e",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "tensor([18], device='cuda:0', dtype=torch.int32)"
+      ]
+     },
+     "execution_count": 25,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n",
+    "foo(A, 5)\n",
+    "A"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "dc30c2d2",
+   "metadata": {},
+   "source": [
+    "### Macros returning multiple values"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 26,
+   "id": "d5a2388f",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "# from tvm.script import tir as T\n",
+       "\n",
+       "@T.prim_func\n",
+       "def foo():\n",
+       "    # with T.block(\"root\"):\n",
+       "    x = T.launch_thread(\"blockIdx.x\", 32)\n",
+       "    tx = T.launch_thread(\"threadIdx.x\", 128)\n",
+       "    ty = T.launch_thread(\"threadIdx.y\", 1)\n",
+       "    tz = T.launch_thread(\"threadIdx.z\", 1)\n",
+       "    with T.block(\"tilelang_root\"):\n",
+       "        T.reads()\n",
+       "        T.writes()\n",
+       "        s: T.int32 = T.sin(x)\n",
+       "        c: T.int32 = T.cos(x)\n",
+       "        a: T.int32 = s + c\n",
+       "        b: T.int32 = s - c\n",
+       "        T.evaluate(0)"
+      ]
+     },
+     "execution_count": 26,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "@T.macro\n",
+    "def sincos(x):\n",
+    "    return T.sin(x), T.cos(x)\n",
+    "\n",
+    "\n",
+    "@T.prim_func\n",
+    "def foo():\n",
+    "    with T.Kernel(32) as x:\n",
+    "        s, c = sincos(x)\n",
+    "        a = s + c  # noqa: F841\n",
+    "        b = s - c  # noqa: F841\n",
+    "\n",
+    "\n",
+    "foo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dd83fea7",
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "tilelang-dev_0",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.9"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/examples/eager_jit/eagerjit.zh.ipynb b/examples/eager_jit/eagerjit.zh.ipynb
new file mode 100644
index 0000000000..0f7c9be99e
--- /dev/null
+++ b/examples/eager_jit/eagerjit.zh.ipynb
@@ -0,0 +1,977 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "5e0deecc",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sys\n",
+    "from pathlib import Path\n",
+    "\n",
+    "sys.path.insert(0, str(Path.cwd().parent.parent.absolute()))\n",
+    "import tilelang\n",
+    "import torch\n",
+    "import tilelang.language as T"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "1ca2c56d",
+   "metadata": {},
+   "source": [
+    "# Tilelang Eager JIT"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "156e7370",
+   "metadata": {},
+   "source": [
+    "## Tensor Annotation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "b070c109",
+   "metadata": {},
+   "source": [
+    "Tilelang Eager JIT merges JIT kernel generation and invocation into a single workflow.\n",
+    "\n",
+    "The function signature looks similar to Triton, but we add many enhancements; the most important one is allowing rich Tensor annotations:\n",
+    "\n",
+    "* If a Tensor has complex shape constraints, we can move its annotation into the function body.\n",
+    "* Use `T.const` or `T.dynamic` to create shape variables, then annotate complex Tensors with `T.Tensor`.\n",
+    "* Use `T.empty` to declare return tensors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "id": "60bf8954",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def gemm(\n",
+    "    A,\n",
+    "    B,\n",
+    "    out_dtype: T.dtype = T.float32,\n",
+    "    block_M: int = 128,\n",
+    "    block_N: int = 128,\n",
+    "    block_K: int = 32,\n",
+    "):\n",
+    "    M, N, K = T.const(\"M, N, K\")\n",
+    "\n",
+    "    A: T.Tensor[[M, K], T.float16]\n",
+    "    B: T.Tensor[[K, N], T.float16]\n",
+    "\n",
+    "    C = T.empty((M, N), out_dtype)\n",
+    "\n",
+    "    with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n",
+    "        A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n",
+    "        B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n",
+    "        C_local = T.alloc_fragment((block_M, block_N), out_dtype)\n",
+    "        T.clear(C_local)\n",
+    "        for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n",
+    "            T.copy(A[bx * block_M, k * block_K], A_shared)\n",
+    "            T.copy(B[k * block_K, by * block_N], B_shared)\n",
+    "            T.gemm(A_shared, B_shared, C_local)\n",
+    "        T.copy(C_local, C[bx * block_M, by * block_N])\n",
+    "    return C"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "28f868fe",
+   "metadata": {},
+   "source": [
+    "Calling the function with Tensors directly triggers the full JIT compile-and-run pipeline:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "id": "ee13394a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n",
+    "C = gemm(A, B)\n",
+    "\n",
+    "# check output is correct\n",
+    "C_ref = (A @ B).float()\n",
+    "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c6705091",
+   "metadata": {},
+   "source": [
+    "Changing the call arguments may trigger a recompilation when compilation parameters change:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 4,
+   "id": "d8aab5b7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(512, 1024, dtype=torch.float16, device=\"cuda\")\n",
+    "C = gemm(A, B, block_M=64, block_N=64)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ce6b7391",
+   "metadata": {},
+   "source": [
+    "You can also explicitly call the `compile` method to build the kernel.\n",
+    "\n",
+    "1. `ker.compile` compiles the kernel\n",
+    "2. `ker.get_tir` retrieves the TIR\n",
+    "3. `ker.par_compile` compiles in parallel"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "id": "f3cf3a2d",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "kernel = gemm.compile(A, B, block_M=64, block_N=64)\n",
+    "C = kernel(A, B)"
+   ]
+  },
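+  {
+   "cell_type": "markdown",
+   "id": "0a1b2c3d",
+   "metadata": {},
+   "source": [
+    "A quick sketch of inspecting the TIR (cell added for illustration; we assume `get_tir` accepts the same arguments as `compile` — adjust if the actual signature differs):"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1e2f3a4b",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Assumption: get_tir mirrors compile's argument convention (see note above).\n",
+    "tir_mod = gemm.get_tir(A, B, block_M=64, block_N=64)\n",
+    "print(tir_mod)"
+   ]
+  },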
+  {
+   "cell_type": "markdown",
+   "id": "921761b5",
+   "metadata": {},
+   "source": [
+    "## More Tensor Annotation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4539e54e",
+   "metadata": {},
+   "source": [
+    "### Use macros to separate implementation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ad96ba65",
+   "metadata": {},
+   "source": [
+    "Next, we implement a simple GEMM in several different ways. For convenience, we first write a macro that contains the core GEMM logic:"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "id": "171d4fe6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@T.macro\n",
+    "def gemm_impl(A, B, C, M, N, K, block_M, block_N, block_K):\n",
+    "    with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n",
+    "        A_shared = T.alloc_shared((block_M, block_K), A.dtype)\n",
+    "        B_shared = T.alloc_shared((block_K, block_N), B.dtype)\n",
+    "        C_local = T.alloc_fragment((block_M, block_N), C.dtype)\n",
+    "        T.clear(C_local)\n",
+    "        for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=3):\n",
+    "            T.copy(A[bx * block_M, k * block_K], A_shared)\n",
+    "            T.copy(B[k * block_K, by * block_N], B_shared)\n",
+    "            T.gemm(A_shared, B_shared, C_local)\n",
+    "        T.copy(C_local, C[bx * block_M, by * block_N])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "446a1acd",
+   "metadata": {},
+   "source": [
+    "### Use `T.dynamic` to mark dynamic shapes\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "id": "6a38aa95",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def gemm_dyn_K(A, B):\n",
+    "    M, N, K = T.dynamic(\"M, N, K\")\n",
+    "    A: T.Tensor[[M, K], T.float16]\n",
+    "    B: T.Tensor[[K, N], T.float16]\n",
+    "    C = T.empty((M, N), T.float32)\n",
+    "    gemm_impl(A, B, C, M, N, K, 128, 128, 32)\n",
+    "    return C"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 8,
+   "id": "fe6cfdc8",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n",
+    "C = gemm_dyn_K(A, B)\n",
+    "C_ref = (A @ B).float()\n",
+    "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2ee97bf7",
+   "metadata": {},
+   "source": [
+    "### Use `T.StridedTensor` to annotate tensors with strides\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 9,
+   "id": "9dde1dae",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def as_contingious(A):\n",
+    "    M, N, dM, dN = T.dynamic(\"M, N, dM, dN\")\n",
+    "    A: T.StridedTensor[[M, N], [dM, dN], T.float32]\n",
+    "    B = T.empty((M, N), A.dtype)\n",
+    "    block_M = 128\n",
+    "    block_N = 128\n",
+    "    with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by):\n",
+    "        T.copy(\n",
+    "            A[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n",
+    "            B[bx * block_M : (bx + 1) * block_M, by * block_N : (by + 1) * block_N],\n",
+    "        )\n",
+    "    return B"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 10,
+   "id": "dec2c0a7",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 1024, device=\"cuda\")\n",
+    "B = as_contingious(A.T)\n",
+    "B_ref = A.T.contiguous()\n",
+    "torch.testing.assert_close(B, B_ref)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f5fb20d6",
+   "metadata": {},
+   "source": [
+    "## More Annotation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "890df0a2",
+   "metadata": {},
+   "source": [
+    "### Use parameters directly as annotations"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e9a47d42",
+   "metadata": {},
+   "source": [
+    "You can directly use function parameters in the annotations."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "id": "0fc17af6",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def gemm_ptr(\n",
+    "    A,\n",
+    "    B,\n",
+    "    M,\n",
+    "    N,\n",
+    "    K,\n",
+    "):\n",
+    "    A: T.Tensor[[M, K], T.float16]\n",
+    "    B: T.Tensor[[K, N], T.float16]\n",
+    "    C = T.empty((M, N), T.float32)\n",
+    "    gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n",
+    "    return C"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "id": "8e52a554",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n",
+    "C = gemm_ptr(A, B, 1024, 256, 512)\n",
+    "C_ref = (A @ B).float()\n",
+    "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "6b19ef90",
+   "metadata": {},
+   "source": [
+    "### Annotations for runtime variables"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "bba5f27f",
+   "metadata": {},
+   "source": [
+    "Runtime variables work the same; if the function annotation becomes too long, you can move it into the function body."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 13,
+   "id": "c1e7598a",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def gemm_ptr_dyn(A, B, M, N, K):\n",
+    "    M: T.int32\n",
+    "    N: T.int32\n",
+    "    K: T.int32\n",
+    "    A: T.Tensor[[M, K], T.float16]\n",
+    "    B: T.Tensor[[K, N], T.float16]\n",
+    "    C = T.empty((M, N), T.float32)\n",
+    "    gemm_impl(A, B, C, M, N, K, block_M=128, block_N=128, block_K=32)\n",
+    "    return C"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 14,
+   "id": "9e9a4c88",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, 512, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(512, 256, dtype=torch.float16, device=\"cuda\")\n",
+    "C = gemm_ptr_dyn(A, B, 1024, 256, 512)\n",
+    "C_ref = (A @ B).float()\n",
+    "torch.testing.assert_close(C, C_ref, rtol=1e-2, atol=1e-2)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "81427765",
+   "metadata": {},
+   "source": [
+    "### Constraints for constants"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "4d6b084b",
+   "metadata": {},
+   "source": [
+    "A constant annotation created by `T.const` must be used directly at least once, otherwise an error is raised."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 15,
+   "id": "c90dd24f",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Constexpr variable `M` is not used in any buffer shape or stride.\n",
+      "At least one **DIRECT** usage is required. Please check:\n",
+      "(1) the variable is not used\n",
+      "(2) all uses are indirect, e.g. M * 2, M * 3. (you can replace them with separate constexpr variables)\n",
+      "Buffer shapes: {A: [M * 2, M * 3]}\n",
+      "Buffer strides: {A: [M * 3, 1]}\n"
+     ]
+    }
+   ],
+   "source": [
+    "@tilelang.jit\n",
+    "def example_wrong_kernel(A):\n",
+    "    M = T.const(\"M\")\n",
+    "    A: T.Tensor[[M * 2, M * 3], T.float32]\n",
+    "    with T.Kernel(1) as _:\n",
+    "        A[0, 0]\n",
+    "\n",
+    "\n",
+    "try:\n",
+    "    A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n",
+    "    example_wrong_kernel(A)\n",
+    "except Exception as e:\n",
+    "    print(e)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "e07e762b",
+   "metadata": {},
+   "source": [
+    "### Dynamic dimensions"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f48e5d7a",
+   "metadata": {},
+   "source": [
+    "If you want certain parameters in a Tensor annotation to change, it is recommended to switch to the `T.ptr` + `T.match_buffer` style."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 16,
+   "id": "1d050321",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 16,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "@tilelang.jit\n",
+    "def dyn_annot(\n",
+    "    A: T.ptr,  # 1. T.ptr type annotation\n",
+    "    is_2d=False,\n",
+    "):\n",
+    "    if is_2d:\n",
+    "        M, N = T.const(\"M, N\")\n",
+    "        # 2. dynamic shape annotation inside function body\n",
+    "        A = T.match_buffer(A, [M, N], T.float32)\n",
+    "        with T.Kernel(1) as _:\n",
+    "            A[0, 0]\n",
+    "    else:\n",
+    "        L = T.const(\"L\")\n",
+    "        A = T.match_buffer(A, [L], T.float32)\n",
+    "        with T.Kernel(1) as _:\n",
+    "            A[0]\n",
+    "\n",
+    "\n",
+    "A = torch.randn(64, 96, dtype=torch.float32, device=\"cuda\")\n",
+    "dyn_annot(A, is_2d=True)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "2e9f1bb3",
+   "metadata": {},
+   "source": [
+    "### Default arguments"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "f7fc9917",
+   "metadata": {},
+   "source": [
+    "Scalar annotations like `T.float32` can carry default values."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 17,
+   "id": "42ec86a1",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def add_one(X, data: T.float32 = 1):\n",
+    "    M, N = T.const(\"M, N\")\n",
+    "    X: T.Tensor[[M, N], T.float32]\n",
+    "    Y = T.empty((M, N), T.float32)\n",
+    "    with T.Kernel(T.ceildiv(M, 128), threads=128) as bx:\n",
+    "        for i, j in T.Parallel(128, N):\n",
+    "            Y[bx * 128 + i, j] = X[bx * 128 + i, j] + data\n",
+    "    return Y"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 18,
+   "id": "d49e1120",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "X = torch.randn(1024, 1024, dtype=torch.float32, device=\"cuda\")\n",
+    "Y = add_one(X)\n",
+    "torch.testing.assert_close(Y, X + 1)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "a02baedc",
+   "metadata": {},
+   "source": [
+    "## Overhead of argument matching"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "860a2972",
+   "metadata": {},
+   "source": [
+    "EagerJIT has very small overhead; each additional constant annotation costs about 200 ns.\n",
+    "* 200 ns is roughly the cost of an FFI call that reads parameters from a `torch.Tensor`'s shape/stride."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "dc676e33",
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Kernel call    : 7.68 us\n",
+      "Parse cache key: 0.41 us\n"
+     ]
+    }
+   ],
+   "source": [
+    "import time\n",
+    "\n",
+    "A = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n",
+    "B = torch.randn(128, 128, dtype=torch.float16, device=\"cuda\")\n",
+    "\n",
+    "\n",
+    "@tilelang.jit\n",
+    "def dummy_kernel(A, B):\n",
+    "    M, N = T.const(\"M, N\")\n",
+    "    A: T.Tensor[[M, N], T.float16]\n",
+    "    B: T.Tensor[[M, N], T.float16]\n",
+    "    with T.Kernel(1) as _:\n",
+    "        pass\n",
+    "\n",
+    "\n",
+    "# compile it first\n",
+    "dummy_kernel(A, B)\n",
+    "\n",
+    "\n",
+    "def eval_overhead(f):\n",
+    "    start = time.perf_counter_ns()\n",
+    "    for _ in range(10000):\n",
+    "        f()\n",
+    "    stop = time.perf_counter_ns()\n",
+    "    return (stop - start) / 10000 / 1000\n",
+    "\n",
+    "\n",
+    "kernel_call_overhead = eval_overhead(lambda: dummy_kernel(A, B))\n",
+    "parse_cache_key_overhead = eval_overhead(lambda: dummy_kernel.parse_cache_key(A, B))\n",
+    "\n",
+    "print(f\"Kernel call    : {kernel_call_overhead:.2f} us\")\n",
+    "print(f\"Parse cache key: {parse_cache_key_overhead:.2f} us\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "39166cb4",
+   "metadata": {},
+   "source": [
+    "## Compilation and parallel compilation"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "8c6fbe08",
+   "metadata": {},
+   "source": [
+    "Both EagerJIT and the original `jit` (i.e. LazyJIT) support parallel compilation.\n",
+    "\n",
+    "To avoid wasting memory on temporary `torch.Tensor` objects, you can use `T.Tensor` to create placeholders."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 20,
+   "id": "7222e57b",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "8a4e4eb3cd4445bda6e8693da31ef3b8",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Elaborating:   0%|          | 0/8 [00:00,\n",
+       " ,\n",
+       " ,\n",
+       " ,\n",
+       " ,\n",
+       " ,\n",
+       " ,\n",
+       " ]"
+      ]
+     },
+     "execution_count": 20,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "from itertools import product\n",
+    "\n",
+    "\n",
+    "def get_configs():\n",
+    "    return [\n",
+    "        {\n",
+    "            \"A\": T.Tensor((1024, 1024), T.float32),\n",
+    "            \"B\": T.Tensor((1024, 1024), T.float32),\n",
+    "            \"block_M\": block_M,\n",
+    "            \"block_N\": block_N,\n",
+    "            \"block_K\": block_K,\n",
+    "        }\n",
+    "        for block_M, block_N, block_K in product([32, 64], repeat=3)\n",
+    "    ]\n",
+    "\n",
+    "\n",
+    "gemm.par_compile(get_configs())"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "5160d2cc",
+   "metadata": {},
+   "source": [
+    "## More convenient macros"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "be44afc4",
+   "metadata": {},
+   "source": [
+    "tilelang's macros have been improved:\n",
+    "\n",
+    "1. Allow using `T.Ref` as an annotation, similar to C++ references.\n",
+    "2. Allow returning multiple values.\n",
+    "3. Allow nesting and recursion."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "79575972",
+   "metadata": {},
+   "source": [
+    "### Passing references with `T.Ref`\n",
+    "\n",
+    "A `T.Ref` reference can point to a scalar variable or to an element of a buffer."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 21,
+   "id": "90eaa6e5",
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "# from tvm.script import tir as T\n",
+       "\n",
+       "@T.prim_func\n",
+       "def foo(x_handle: T.handle):\n",
+       "    x = T.match_buffer(x_handle, (2,), strides=(1,))\n",
+       "    # with T.block(\"root\"):\n",
+       "    bx = T.launch_thread(\"blockIdx.x\", 1)\n",
+       "    tx = T.launch_thread(\"threadIdx.x\", 128)\n",
+       "    ty = T.launch_thread(\"threadIdx.y\", 1)\n",
+       "    tz = T.launch_thread(\"threadIdx.z\", 1)\n",
+       "    with T.block(\"tilelang_root\"):\n",
+       "        T.reads()\n",
+       "        idx = T.Buffer((1,), \"int32\", scope=\"local.var\")\n",
+       "        T.writes(x[T.min(1, idx[0]):T.min(1, idx[0]) + (T.max(1, idx[0]) + 1 - T.min(1, idx[0]))])\n",
+       "        T.block_attr({\"tl.local_var_init\": {idx.data: 0}})\n",
+       "        idx = T.alloc_buffer((1,), \"int32\", data=idx.data, scope=\"local.var\")\n",
+       "        x[1] = T.float32(1.0)\n",
+       "        _tmp: T.int32 = idx[0]\n",
+       "        x[_tmp] = T.float32(1.0)"
+      ]
+     },
+     "execution_count": 21,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "@T.macro\n",
+    "def macro_with_ref(x: T.Ref):\n",
+    "    x = 1  # noqa: F841\n",
+    "\n",
+    "\n",
+    "@T.prim_func\n",
+    "def foo(x: T.Tensor((2,))):\n",
+    "    with T.Kernel(1) as _:\n",
+    "        # Supports constant indices\n",
+    "        macro_with_ref(x[1])\n",
+    "\n",
+    "        # Also supports variable indices\n",
+    "        idx = T.alloc_var(T.int32, 0)\n",
+    "        macro_with_ref(x[idx])\n",
+    "\n",
+    "\n",
+    "foo"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "7bb447a2",
+   "metadata": {},
+   "source": [
+    "### Pass macros as arguments\n",
+    "\n",
+    "You can pass a macro as a function argument."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 22,
+   "id": "dc7bb779",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@tilelang.jit\n",
+    "def element_wise(A, fn):\n",
+    "    N = T.dynamic(\"N\")\n",
+    "    A: T.Tensor[[N], T.float32]\n",
+    "    B = T.empty((N,), dtype=A.dtype)\n",
+    "    block_N = 128\n",
+    "    with T.Kernel(T.ceildiv(N, block_N), threads=128) as bx:\n",
+    "        for i in T.Parallel(block_N):\n",
+    "            idx = bx * block_N + i\n",
+    "            B[idx] = fn(A[idx])\n",
+    "    return B\n",
+    "\n",
+    "\n",
+    "@T.macro\n",
+    "def add_one(x):\n",
+    "    return x + 1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 23,
+   "id": "a89fdb44",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "A = torch.randn(1024, device=\"cuda\")\n",
+    "B = element_wise(A, add_one)\n",
+    "B_ref = A + 1\n",
+    "torch.testing.assert_close(B, B_ref)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "ef6e403a",
+   "metadata": {},
+   "source": [
+    "### Recursive macros\n",
+    "\n",
+    "You may not need this often, but macros can be recursive as long as the termination condition is known at compile time."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 24,
+   "id": "7703cab5",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "@T.macro\n",
+    "def n31(x, var: T.Ref):\n",
+    "    if x == 1:\n",
+    "        pass\n",
+    "    elif x % 2 == 0:\n",
+    "        var = var // 2\n",
+    "        n31(x // 2, var)\n",
+    "    else:\n",
+    "        var = var * 3 + 1\n",
+    "        n31(x * 3 + 1, var)\n",
+    "\n",
+    "\n",
+    "@tilelang.jit\n",
+    "def foo(A: T.Tensor[[1], T.int32], n: int):\n",
+    "    with T.Kernel(1) as _:\n",
+    "        n31(n, A[0])"
+   ]
+  },
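+  {
+   "cell_type": "markdown",
+   "id": "3c4d5e6f",
+   "metadata": {},
+   "source": [
+    "Worked trace (added for illustration): with `n = 5` and `A = [100]`, the compile-time argument follows the Collatz steps 5 → 16 → 8 → 4 → 2 → 1, while `A[0]` mirrors each step at runtime: 100 → 301 → 150 → 75 → 37 → 18."
+   ]
+  },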
"source": [ + "A = torch.tensor([100], dtype=torch.int32, device=\"cuda\")\n", + "foo(A, 5)\n", + "A" + ] + }, + { + "cell_type": "markdown", + "id": "dc30c2d2", + "metadata": {}, + "source": [ + "### Macro 返回多个值" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "id": "d5a2388f", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "# from tvm.script import tir as T\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " # with T.block(\"root\"):\n", + " x = T.launch_thread(\"blockIdx.x\", 32)\n", + " tx = T.launch_thread(\"threadIdx.x\", 128)\n", + " ty = T.launch_thread(\"threadIdx.y\", 1)\n", + " tz = T.launch_thread(\"threadIdx.z\", 1)\n", + " with T.block(\"tilelang_root\"):\n", + " T.reads()\n", + " T.writes()\n", + " s: T.int32 = T.sin(x)\n", + " c: T.int32 = T.cos(x)\n", + " a: T.int32 = s + c\n", + " b: T.int32 = s - c\n", + " T.evaluate(0)" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "@T.macro\n", + "def sincos(x):\n", + " return T.sin(x), T.cos(x)\n", + "\n", + "\n", + "@T.prim_func\n", + "def foo():\n", + " with T.Kernel(32) as x:\n", + " s, c = sincos(x)\n", + " a = s + c # noqa: F841\n", + " b = s - c # noqa: F841\n", + "\n", + "\n", + "foo" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "dd83fea7", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "tilelang-dev_0", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.12.9" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/examples/elementwise/example_elementwise_add.py b/examples/elementwise/example_elementwise_add.py index 32da940155..3d142ed542 100644 --- a/examples/elementwise/example_elementwise_add.py +++ b/examples/elementwise/example_elementwise_add.py @@ -1,5 +1,4 @@ import argparse -import itertools import torch import tilelang import tilelang.language as T @@ -9,15 +8,6 @@ def ref_program(x, y): return x + y -def get_configs(): - block_M = [64, 128, 256] - block_N = [64, 128, 256] - threads = [64, 128, 256] - configs = list(itertools.product(block_M, block_N, threads)) - return [{"block_M": bm, "block_N": bn, "threads": th} for bm, bn, th in configs] - - -@tilelang.autotune(configs=get_configs()) @tilelang.jit(out_idx=[-1]) def elementwise_add(M, N, block_M, block_N, in_dtype, out_dtype, threads): @T.prim_func @@ -42,12 +32,7 @@ def main(M=1024, N=1024, use_autotune=False): a = torch.randn(M, N, dtype=torch.float32, device="cuda") b = torch.randn(M, N, dtype=torch.float32, device="cuda") - if use_autotune: - kernel = elementwise_add(M, N, in_dtype=T.float32, out_dtype=T.float32) - else: - # Default config - config = {"block_M": 32, "block_N": 32, "threads": 128} - kernel = elementwise_add(M, N, **config, in_dtype=T.float32, out_dtype=T.float32) + kernel = elementwise_add(M, N, block_M=32, block_N=32, threads=128, in_dtype=T.float32, out_dtype=T.float32) out = kernel(a, b) torch.testing.assert_close(out, ref_program(a, b), rtol=1e-2, atol=1e-2) @@ -72,6 +57,5 @@ def run_regression_perf(): parser = argparse.ArgumentParser() parser.add_argument("--m", type=int, default=1024) parser.add_argument("--n", type=int, default=1024) - parser.add_argument("--use_autotune", action="store_true", 
default=False) args, _ = parser.parse_known_args() - main(args.m, args.n, args.use_autotune) + main(args.m, args.n) diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce.py b/examples/flash_attention/example_gqa_bwd_tma_reduce.py index fea547b6e6..4920d8cf06 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce.py @@ -5,8 +5,6 @@ from tilelang.contrib import nvcc import argparse -tilelang.disable_cache() - @tilelang.jit( out_idx=[3, 4], @@ -49,13 +47,13 @@ def flash_fwd( T.fill(logsum, 0) # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops # We should set it to negative large number instead - T.fill(scores_max, T.Cast(accum_dtype, -1e30)) + T.fill(scores_max, T.cast(-1e30, accum_dtype)) loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) for k in T.Pipelined(loop_range, num_stages=1): T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) if is_causal: for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.Cast(accum_dtype, -1e30)) + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, T.cast(-1e30, accum_dtype)) else: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) @@ -211,14 +209,6 @@ def flash_bwd( dv_shared = T.alloc_shared([block_M, dim_v], accum_dtype) dq_shared = T.alloc_shared([block_N, dim_qk], accum_dtype) - T.annotate_layout( - { - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - } - ) - T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) T.clear(dv) @@ -389,7 +379,6 @@ def maybe_contiguous(x): block_M = 128 block_N = 32 mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD_V) - mod_post = flashattn_bwd_postprocess(BATCH, H, HEAD_KV, N_CTX, D_HEAD_QK, D_HEAD_V) delta = mod_prep(o, do) if ctx.use_atomic: @@ -403,11 +392,11 @@ def maybe_contiguous(x): dk = torch.zeros(shape_k, dtype=torch.float32, device=q.device) dv = torch.zeros(shape_v, dtype=torch.float32, device=q.device) kernel(q, k, v, do, lse, delta, dq, dk, dv) - dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split_novarlen( BATCH, H, N_CTX, D_HEAD_QK, D_HEAD_V, ctx.causal, block_M, block_N, threads=256, num_stages=2, groups=groups ) + mod_post = flashattn_bwd_postprocess(BATCH, H, HEAD_KV, N_CTX, D_HEAD_QK, D_HEAD_V) shape_q = [BATCH, N_CTX, H, D_HEAD_QK] shape_k = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_QK] # sum after kernel shape_v = [groups, BATCH, N_CTX, HEAD_KV, D_HEAD_V] # sum after kernel diff --git a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py index a9f45e077d..b09eec00c4 100644 --- a/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py +++ b/examples/flash_attention/example_gqa_bwd_tma_reduce_varlen.py @@ -76,7 +76,7 @@ def flash_fwd( T.fill(logsum, 0.0) # Warning: in causal/varlen/unaligned seqlen scenarios, the -inf will cause undefined behavior in exp ops # We should set it to negative large number instead - T.fill(scores_max, T.Cast(accum_dtype, -1e30)) + T.fill(scores_max, T.cast(-1e30, accum_dtype)) loop_range = T.ceildiv(k_current_seqlen, block_N) for k in T.Pipelined(loop_range, num_stages=1): for i, d in 
T.Parallel(block_N, dim_qk): @@ -91,12 +91,12 @@ def flash_fwd( (bx * block_M + i >= k * block_N + j) and (bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen), 0, - T.Cast(accum_dtype, -1e30), + T.cast(-1e30, accum_dtype), ) else: for i, j in T.Parallel(block_M, block_N): acc_s[i, j] = T.if_then_else( - bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen, 0, T.Cast(accum_dtype, -1e30) + bx * block_M + i < q_current_seqlen and k * block_N + j < k_current_seqlen, 0, T.cast(-1e30, accum_dtype) ) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, d in T.Parallel(block_N, dim_v): @@ -286,14 +286,6 @@ def flash_bwd( q_current_seqlen = q_end_idx - q_start_idx k_current_seqlen = k_end_idx - k_start_idx - T.annotate_layout( - { - dQ: make_dq_layout(dQ), - dK: make_dq_layout(dK), - dV: make_dq_layout(dV), - } - ) - T.copy(K[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], K_shared) T.copy(V[k_start_idx + by * block_M : k_start_idx + (by + 1) * block_M, bx // groups, :], V_shared) @@ -508,8 +500,8 @@ def forward( total_q = q_unpad.shape[0] total_kv = k_unpad.shape[0] - mod = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, block_M, block_N, groups) - o_unpad, lse = mod(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) + kernel = flashattn_fwd(BATCH, total_q, total_kv, N_CTX, H, max_seqlen_q, D_HEAD_QK, D_HEAD_V, causal, block_M, block_N, groups) + o_unpad, lse = kernel(q_unpad, k_unpad, v_unpad, cu_seqlens_q, cu_seqlens_k) o = pad_input(o_unpad, indices_q, BATCH, N_CTX) ctx.save_for_backward(q_unpad, k_unpad, v_unpad, o_unpad, lse, seqlens_q, seqlens_k, cu_seqlens_q, cu_seqlens_k) ctx.batch = BATCH @@ -541,7 +533,6 @@ def maybe_contiguous(x): block_M = 128 block_N = 32 mod_prep = flashattn_bwd_preprocess(BATCH, H, total_q, N_CTX, ctx.max_seqlen_q, D_HEAD_V) - mod_post = flashattn_bwd_postprocess(total_q, total_kv, H, HEAD_KV, D_HEAD_QK, D_HEAD_V) delta = mod_prep(o, do, cu_seqlens_q) if ctx.use_atomic: @@ -565,7 +556,6 @@ def maybe_contiguous(x): dk = torch.zeros_like(k, dtype=torch.float32) dv = torch.zeros_like(v, dtype=torch.float32) kernel(q, k, v, do, lse_clone, delta, cu_seqlens_q, cu_seqlens_k, dq, dk, dv) - dq, dk, dv = mod_post(dq, dk, dv) else: kernel = flashattn_bwd_split( BATCH, @@ -583,6 +573,7 @@ def maybe_contiguous(x): num_stages=2, groups=groups, ) + mod_post = flashattn_bwd_postprocess(total_q, total_kv, H, HEAD_KV, D_HEAD_QK, D_HEAD_V) dq = torch.zeros_like(q, dtype=torch.float32) dk = torch.empty(groups, *k.shape, dtype=torch.float16, device=q.device) dv = torch.empty(groups, *v.shape, dtype=torch.float16, device=q.device) diff --git a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py b/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py deleted file mode 100644 index c0fe4e33d2..0000000000 --- a/examples/flash_attention/example_mha_bwd_bshd_wgmma_pipelined.py +++ /dev/null @@ -1,353 +0,0 @@ -import torch -import torch.nn.functional as F -import tilelang -import tilelang.language as T -from tilelang.profiler import do_bench -import argparse - - -@tilelang.jit( - out_idx=[3, 4], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, -) -def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) - shape = [batch, seq_len, heads, dim] - dtype = T.float16 - accum_dtype = T.float32 - - @T.prim_func - def 
flash_fwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - Output: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - ): - with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): - Q_shared = T.alloc_shared([block_M, dim], dtype) - K_shared = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_N, dim], dtype) - acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) - acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) - acc_o = T.alloc_fragment([block_M, dim], accum_dtype) - scores_max = T.alloc_fragment([block_M], accum_dtype) - scores_max_prev = T.alloc_fragment([block_M], accum_dtype) - scores_scale = T.alloc_fragment([block_M], accum_dtype) - scores_sum = T.alloc_fragment([block_M], accum_dtype) - logsum = T.alloc_fragment([block_M], accum_dtype) - - T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) - T.fill(acc_o, 0) - T.fill(logsum, 0) - T.fill(scores_max, -T.infinity(accum_dtype)) - - loop_range = T.ceildiv((bx + 1) * block_M, block_N) if is_causal else T.ceildiv(seq_len, block_N) - for k in T.Pipelined(loop_range, num_stages=1): - T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) - else: - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) - T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) - T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) - T.copy(scores_max, scores_max_prev) - T.reduce_max(acc_s, scores_max, dim=1, clear=False) - for i in T.Parallel(block_M): - scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) - for i in T.Parallel(block_M): - scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] *= scores_scale[i] - for i, j in T.Parallel(block_M, block_N): - acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) - T.copy(acc_s, acc_s_cast) - T.gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) - T.reduce_sum(acc_s, scores_sum, dim=1) - for i in T.Parallel(block_M): - logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] - for i, j in T.Parallel(block_M, dim): - acc_o[i, j] /= logsum[i] - T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) - for i in T.Parallel(block_M): - logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale - T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) - - return flash_fwd - - -@tilelang.jit( - out_idx=[2], - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - }, -) -def flashattn_bwd_preprocess(batch, heads, seq_len, dim): - dtype = T.float16 - accum_dtype = T.float32 - shape = [batch, seq_len, heads, dim] - blk = 32 - - @T.prim_func - def flash_bwd_prep( - O: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - ): - with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): - o = T.alloc_fragment([blk, blk], dtype) - do = T.alloc_fragment([blk, blk], dtype) - acc = T.alloc_fragment([blk, blk], accum_dtype) - delta = T.alloc_fragment([blk], accum_dtype) - T.clear(acc) - 
for k in range(T.ceildiv(dim, blk)): - T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) - T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) - for i, j in T.Parallel(blk, blk): - acc[i, j] += o[i, j] * do[i, j] - T.reduce_sum(acc, delta, 1) - T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) - - return flash_bwd_prep - - -@tilelang.jit( - pass_configs={ - tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, - } -) -def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): - sm_scale = (1.0 / dim) ** 0.5 - scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) - shape = [batch, seq_len, heads, dim] - dtype = T.float16 - accum_dtype = T.float32 - - @T.prim_func - def flash_bwd( - Q: T.Tensor(shape, dtype), # type: ignore - K: T.Tensor(shape, dtype), # type: ignore - V: T.Tensor(shape, dtype), # type: ignore - dO: T.Tensor(shape, dtype), # type: ignore - lse: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - Delta: T.Tensor([batch, heads, seq_len], accum_dtype), # type: ignore - dQ: T.Tensor(shape, accum_dtype), # type: ignore - dK: T.Tensor(shape, dtype), # type: ignore - dV: T.Tensor(shape, dtype), # type: ignore - ): - with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=256) as (bx, by, bz): - K_shared = T.alloc_shared([block_M, dim], dtype) - dsT_shared = T.alloc_shared([block_M, block_N], dtype) - # should not store K to local if dim is large - # K_local = T.alloc_fragment([block_M, dim], dtype) - # K_local_T = T.alloc_fragment([block_M, dim], dtype) - # V_local = T.alloc_fragment([block_M, dim], dtype) - q = T.alloc_shared([block_N, dim], dtype) - V_shared = T.alloc_shared([block_M, dim], dtype) - qkT = T.alloc_fragment([block_M, block_N], accum_dtype) - dsT = T.alloc_fragment([block_M, block_N], accum_dtype) - qkT_cast = T.alloc_fragment([block_M, block_N], dtype) - dsT_cast = T.alloc_fragment([block_M, block_N], dtype) - lse_shared = T.alloc_shared([block_N], accum_dtype) - delta = T.alloc_shared([block_N], accum_dtype) - do = T.alloc_shared([block_N, dim], dtype) - dv = T.alloc_fragment([block_M, dim], accum_dtype) - dk = T.alloc_fragment([block_M, dim], accum_dtype) - dq = T.alloc_fragment([block_N, dim], accum_dtype) - dv_shared = T.alloc_shared([block_M, dim], dtype) - dk_shared = T.alloc_shared([block_M, dim], dtype) - dq_shared = T.alloc_shared([block_N, dim], accum_dtype) - - T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) - T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) - T.clear(dv) - T.clear(dk) - loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 - loop_ed = T.ceildiv(seq_len, block_N) - for k in T.Pipelined(loop_st, loop_ed, num_stages=2): - T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) - T.clear(qkT) - T.gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) - T.clear(dsT) - T.gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - T.wait_wgmma(1) - - T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) - for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) - if is_causal: - for i, j in T.Parallel(block_M, block_N): - qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) - # We don't need to handle OOB positions for non-causal cases, - # since OOB values won't affect other positions here. 
- T.wait_wgmma(0) - T.copy(qkT, qkT_cast) - T.gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow, wg_wait=-1) - - T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) - - for i, j in T.Parallel(block_M, block_N): - dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale - T.gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow, wg_wait=1) - - T.copy(dsT_cast, dsT_shared) - T.clear(dq) - T.gemm(dsT_shared, K_shared, dq, transpose_A=True, wg_wait=1) - T.wait_wgmma(0) - T.copy(dq, dq_shared) - T.atomic_add(dQ[bz, k * block_N : (k + 1) * block_N, bx, :], dq_shared) - T.copy(dv, dv_shared) - T.copy(dk, dk_shared) - T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) - T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) - - return flash_bwd - - -class _attention(torch.autograd.Function): - @staticmethod - def forward(ctx, q, k, v, causal): - BATCH, N_CTX, H, D_HEAD = q.shape - block_M = 64 - block_N = 64 if D_HEAD <= 128 else 32 - mod = flashattn_fwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) - o, lse = mod(q, k, v) - ctx.save_for_backward(q, k, v, o, lse) - ctx.causal = causal - return o - - @staticmethod - def backward(ctx, do): - q, k, v, o, lse = ctx.saved_tensors - BATCH, N_CTX, H, D_HEAD = q.shape - - def maybe_contiguous(x): - if x.stride(-1) != 1: - return x.contiguous() - return x - - do, q, k, v, o = [maybe_contiguous(x) for x in (do, q, k, v, o)] - block_M = 128 - block_N = 128 if D_HEAD <= 64 else 32 - mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) - delta = mod_prep(o, do) - mod = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, ctx.causal, block_M, block_N) - shape = [BATCH, N_CTX, H, D_HEAD] - dq = torch.zeros(shape, dtype=torch.float32, device=q.device) - dk = torch.empty(shape, dtype=torch.float16, device=q.device) - dv = torch.empty(shape, dtype=torch.float16, device=q.device) - mod(q, k, v, do, lse, delta, dq, dk, dv) - dq = dq.to(torch.float16) - return dq, dk, dv, None - - -attention = _attention.apply - - -def ref_program(Q, K, V, is_causal): - dim = Q.size(-1) - scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) - scores = scores / torch.sqrt(torch.tensor(dim, dtype=scores.dtype)) - if is_causal: - seq_len = Q.size(1) - mask = torch.tril(torch.ones(seq_len, seq_len, device=scores.device)) - mask = mask.unsqueeze(0).unsqueeze(0) - scores = scores.masked_fill(mask == 0, float("-inf")) - attention_weights = F.softmax(scores, dim=-1) - output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V) - return output - - -def main( - BATCH: int = 8, - H: int = 32, - N_CTX: int = 1024, - D_HEAD: int = 64, - causal: bool = False, -): - flops_per_matmul = 2.0 * BATCH * H * N_CTX * N_CTX * D_HEAD - total_flops = 5 * flops_per_matmul - if causal: - total_flops *= 0.5 - Q = torch.empty(BATCH, N_CTX, H, D_HEAD, dtype=torch.half, device="cuda").normal_().requires_grad_() - K = torch.empty_like(Q).normal_().requires_grad_() - V = torch.empty_like(Q).normal_().requires_grad_() - dO = torch.randn_like(Q) - O = attention(Q, K, V, causal) - O.backward(dO, retain_graph=True) - dQ, Q.grad = Q.grad.clone(), None - dK, K.grad = K.grad.clone(), None - dV, V.grad = V.grad.clone(), None - - O_ref = ref_program(Q, K, V, causal) - O_ref.backward(dO, retain_graph=True) - dQ_ref, Q.grad = Q.grad.clone(), None - dK_ref, K.grad = K.grad.clone(), None - dV_ref, V.grad = V.grad.clone(), None - - assert torch.allclose(O, O_ref, rtol=1e-2, atol=1e-2) - assert torch.allclose(dV, dV_ref, rtol=1e-2, atol=1e-2) - assert torch.allclose(dK, dK_ref, 
rtol=1e-2, atol=1e-2) - assert torch.allclose(dQ, dQ_ref, rtol=1e-2, atol=1e-2) - print("All checks passed.✅") - - def run(): - O_ref.backward(dO, retain_graph=True) - - def run1(): - O.backward(dO, retain_graph=True) - - latency = do_bench(run, warmup=500) - print("torch: {:.2f} ms".format(latency)) - print("torch: {:.2f} TFlops".format(total_flops / latency * 1e-9)) - latency = do_bench(run1, warmup=500) - print("tilelang: {:.2f} ms".format(latency)) - print("tilelang: {:.2f} TFlops".format(total_flops / latency * 1e-9)) - - -def run_regression_perf(): - BATCH = 1 - H = 32 - N_CTX = 256 - D_HEAD = 64 - causal = False - device = "cuda" - torch.manual_seed(0) - block_M = 128 - block_N = 128 if D_HEAD <= 64 else 32 - Q = torch.randn(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.half) - K = torch.randn_like(Q) - V = torch.randn_like(Q) - O = torch.randn_like(Q) - dO = torch.randn_like(Q) - lse = torch.zeros(BATCH, H, N_CTX, device=device, dtype=torch.float32) - with torch.no_grad(): - mod_prep = flashattn_bwd_preprocess(BATCH, H, N_CTX, D_HEAD) - kernel = flashattn_bwd(BATCH, H, N_CTX, D_HEAD, causal, block_M, block_N) - dQ = torch.zeros(BATCH, N_CTX, H, D_HEAD, device=device, dtype=torch.float32) - dK = torch.zeros_like(Q, dtype=torch.float16) - dV = torch.zeros_like(Q, dtype=torch.float16) - Delta = mod_prep(O, dO) - - from tilelang.profiler import do_bench - - def run_kernel_only(): - kernel(Q, K, V, dO, lse, Delta, dQ, dK, dV) - - return do_bench(run_kernel_only, backend="cupti") - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - parser.add_argument("--batch", type=int, default=8, help="Batch size") - parser.add_argument("--h", type=int, default=32, help="Number of heads") - parser.add_argument("--n_ctx", type=int, default=1024, help="Context size") - parser.add_argument("--d_head", type=int, default=64, help="Head dimension") - parser.add_argument("--causal", type=bool, default=False, help="Causal flag") - args = parser.parse_args() - main(args.batch, args.h, args.n_ctx, args.d_head, args.causal) diff --git a/examples/flash_attention/regression_example_flash_attention.py b/examples/flash_attention/regression_example_flash_attention.py index 8710bbb6e2..86bea2f86e 100644 --- a/examples/flash_attention/regression_example_flash_attention.py +++ b/examples/flash_attention/regression_example_flash_attention.py @@ -1,17 +1,12 @@ import tilelang.testing import example_gqa_fwd_bshd -import example_gqa_fwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd -import example_mha_fwd_bhsd_wgmma_pipelined import example_mha_fwd_bshd -import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen import example_gqa_bwd_tma_reduce_varlen import example_gqa_bwd -import example_gqa_bwd_wgmma_pipelined import example_mha_bwd_bshd import example_mha_bwd_bhsd -import example_mha_bwd_bshd_wgmma_pipelined def regression_example_gqa_bwd_tma_reduce_varlen(): @@ -22,10 +17,6 @@ def regression_example_gqa_bwd(): tilelang.testing.process_func(example_gqa_bwd.run_regression_perf) -def regression_example_gqa_bwd_wgmma_pipelined(): - tilelang.testing.process_func(example_gqa_bwd_wgmma_pipelined.run_regression_perf) - - def regression_example_mha_bwd_bshd(): tilelang.testing.process_func(example_mha_bwd_bshd.run_regression_perf) @@ -34,34 +25,16 @@ def regression_example_mha_bwd_bhsd(): tilelang.testing.process_func(example_mha_bwd_bhsd.run_regression_perf) -def regression_example_mha_bwd_bshd_wgmma_pipelined(): - 
tilelang.testing.process_func(example_mha_bwd_bshd_wgmma_pipelined.run_regression_perf) - - -def regression_example_gqa_fwd_bshd_wgmma_pipelined(): - tilelang.testing.process_func( - example_gqa_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 - ) - - def regression_example_gqa_fwd_bshd(): tilelang.testing.process_func( example_gqa_fwd_bshd.run_regression_perf, batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16 ) -def regression_example_mha_fwd_bhsd_wgmma_pipelined(): - tilelang.testing.process_func(example_mha_fwd_bhsd_wgmma_pipelined.run_regression_perf) - - def regression_example_mha_fwd_bhsd(): tilelang.testing.process_func(example_mha_fwd_bhsd.run_regression_perf) -def regression_example_mha_fwd_bshd_wgmma_pipelined(): - tilelang.testing.process_func(example_mha_fwd_bshd_wgmma_pipelined.run_regression_perf, batch=1, heads=32, seq_len=256) - - def regression_example_mha_fwd_bshd(): tilelang.testing.process_func(example_mha_fwd_bshd.run_regression_perf, batch=1, seq_len=256) diff --git a/examples/flash_attention/test_example_flash_attention.py b/examples/flash_attention/test_example_flash_attention.py index a74bf071b9..dc8b9d9266 100644 --- a/examples/flash_attention/test_example_flash_attention.py +++ b/examples/flash_attention/test_example_flash_attention.py @@ -1,22 +1,18 @@ import tilelang.testing import example_gqa_bwd -import example_gqa_bwd_wgmma_pipelined import example_mha_bwd_bshd import example_mha_bwd_bhsd -import example_mha_fwd_bhsd_wgmma_pipelined import example_gqa_fwd_bshd import example_mha_fwd_bshd -import example_gqa_fwd_bshd_wgmma_pipelined -import example_mha_fwd_bshd_wgmma_pipelined import example_mha_fwd_varlen -import example_mha_bwd_bshd_wgmma_pipelined import example_mha_fwd_bhsd import example_gqa_bwd_tma_reduce_varlen import example_gqa_fwd_varlen @tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_gqa_bwd_tma_reduce_varlen(): example_gqa_bwd_tma_reduce_varlen.main() @@ -26,12 +22,6 @@ def test_example_gqa_bwd(): example_gqa_bwd.main() -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_gqa_bwd_wgmma_pipelined(): - example_gqa_bwd_wgmma_pipelined.main() - - @tilelang.testing.requires_cuda def test_example_mha_bwd(): example_mha_bwd_bshd.main( @@ -54,40 +44,16 @@ def test_example_mha_bwd_bhsd(): ) -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_mha_bwd_wgmma_pipelined(): - example_mha_bwd_bshd_wgmma_pipelined.main(BATCH=1, H=32, N_CTX=256, D_HEAD=64, causal=False) - - -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_gqa_fwd_bshd_wgmma_pipelined(): - example_gqa_fwd_bshd_wgmma_pipelined.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) - - @tilelang.testing.requires_cuda def test_example_gqa_fwd_bshd(): example_gqa_fwd_bshd.main(batch=1, heads=16, seq_len=1024, dim=128, is_causal=False, groups=16, tune=False) -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def test_example_mha_fwd_bhsd_wgmma_pipelined(): - example_mha_fwd_bhsd_wgmma_pipelined.main() - - @tilelang.testing.requires_cuda def test_example_mha_fwd_bhsd(): example_mha_fwd_bhsd.main() -@tilelang.testing.requires_cuda -@tilelang.testing.requires_cuda_compute_version_ge(9, 0) -def 
test_example_mha_fwd_bshd_wgmma_pipelined(): - example_mha_fwd_bshd_wgmma_pipelined.main(batch=1, heads=32, seq_len=256) - - @tilelang.testing.requires_cuda def test_example_mha_fwd_bshd(): example_mha_fwd_bshd.main(batch=1, seq_len=256) diff --git a/examples/flash_attention_sm100/gqa_bwd_bshd.py b/examples/flash_attention_sm100/gqa_bwd_bshd.py new file mode 100644 index 0000000000..95e1c35d60 --- /dev/null +++ b/examples/flash_attention_sm100/gqa_bwd_bshd.py @@ -0,0 +1,345 @@ +"""Blackwell (SM100) GQA backward, BSHD layout. + +Q/dQ: [batch, seq_len, heads, dim]; K,V,dK,dV: [batch, seq_len, head_kv, dim]; head_kv = heads // groups. +dK/dV use atomic_add (multiple Q heads -> same KV head). +Pipeline (default): --variant ss. ts (optional): --variant ts. +""" + +import torch +import torch.nn.functional as F +import tilelang +import tilelang.language as T +import argparse + + +PASS_CFG = {tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: False} + + +@tilelang.jit(out_idx=[3, 4], pass_configs=PASS_CFG) +def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=1): + """Forward for LSE; K/V indexed by by // groups.""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + scale = (1.0 / dim) ** 0.5 * 1.44269504 + q_shape = [batch, seq_len, heads, dim] + kv_shape = [batch, seq_len, head_kv, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + loop_range = ( + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) + for k in T.Pipelined(loop_range, num_stages=1): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.tcgen05_gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + T.copy(scores_max, scores_max_prev) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + 
scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.copy(acc_s, acc_s_cast) + T.tcgen05_gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] /= logsum[i] + T.copy(acc_o, Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) + for i in T.Parallel(block_M): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) + + return main + + +@tilelang.jit(out_idx=[2], pass_configs=PASS_CFG) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim): + dtype = T.bfloat16 + accum_dtype = T.float32 + shape = [batch, seq_len, heads, dim] + blk = 32 + + @T.prim_func + def main( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): + o = T.alloc_fragment([blk, blk], dtype) + do = T.alloc_fragment([blk, blk], dtype) + acc = T.alloc_fragment([blk, blk], accum_dtype) + delta = T.alloc_fragment([blk], accum_dtype) + T.clear(acc) + for k in range(T.ceildiv(dim, blk)): + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) + for i, j in T.Parallel(blk, blk): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) + + return main + + +def make_dq_layout(dQ): + return T.Layout( + dQ.shape, + lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2], + ) + + +@tilelang.jit(out_idx=[1], pass_configs=PASS_CFG) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim): + dtype = T.bfloat16 + accum_dtype = T.float32 + shape = [batch, seq_len, heads, dim] + blk = 64 + + @T.prim_func + def main( + dQ: T.Tensor(shape, accum_dtype), + dQ_out: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy( + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], + ) + + return main + + +@tilelang.jit(pass_configs=PASS_CFG) +def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=1, threads=128, num_stages=2): + """GQA backward: K/V/dK/dV use bx // groups; dK/dV use atomic_add.""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 + q_shape = [batch, seq_len, heads, dim] + kv_shape = [batch, seq_len, head_kv, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + dO: T.Tensor(q_shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + dQ: T.Tensor(q_shape, accum_dtype), + dK: T.Tensor(kv_shape, accum_dtype), + dV: T.Tensor(kv_shape, accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as 
(bx, by, bz): + K_shared = T.alloc_shared([block_M, dim], dtype) + dsT_shared = T.alloc_shared([block_M, block_N], dtype) + q = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_M, dim], dtype) + qkT = T.alloc_fragment([block_M, block_N], accum_dtype) + dsT = T.alloc_fragment([block_M, block_N], accum_dtype) + qkT_cast = T.alloc_fragment([block_M, block_N], dtype) + dsT_cast = T.alloc_fragment([block_M, block_N], dtype) + qkT_shared = T.alloc_shared([block_M, block_N], dtype) + lse_shared = T.alloc_shared([block_N], accum_dtype) + delta = T.alloc_shared([block_N], accum_dtype) + do = T.alloc_shared([block_N, dim], dtype) + dv = T.alloc_fragment([block_M, dim], accum_dtype) + dk = T.alloc_fragment([block_M, dim], accum_dtype) + dq = T.alloc_fragment([block_N, dim], accum_dtype) + dv_shared = T.alloc_shared([block_M, dim], accum_dtype) + dk_shared = T.alloc_shared([block_M, dim], accum_dtype) + + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx // groups, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx // groups, :], V_shared) + T.clear(dv) + T.clear(dk) + loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 + loop_ed = T.ceildiv(seq_len, block_N) + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) + T.clear(qkT) + T.tcgen05_gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) + T.clear(dsT) + T.tcgen05_gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(qkT, qkT_cast) + T.copy(qkT_cast, qkT_shared) + T.tcgen05_gemm(qkT_shared, do, dv, policy=T.GemmWarpPolicy.FullRow) + + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) + for i, j in T.Parallel(block_M, block_N): + dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.copy(dsT_cast, dsT_shared) + T.tcgen05_gemm(dsT_shared, q, dk, policy=T.GemmWarpPolicy.FullRow) + T.clear(dq) + T.tcgen05_gemm(dsT_shared, K_shared, dq, transpose_A=True) + for i, j in T.Parallel(block_N, dim): + T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) + T.copy(dv, dv_shared) + T.copy(dk, dk_shared) + T.atomic_add(dV[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dv_shared) + T.atomic_add(dK[bz, by * block_M : (by + 1) * block_M, bx // groups, :], dk_shared) + + return main + + +def flashattn_bwd_pipeline(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=1): + return flashattn_bwd( + batch, + heads, + seq_len, + dim, + is_causal, + block_M, + block_N, + groups=groups, + threads=128, + num_stages=2, + ) + + +def flashattn_bwd_warp(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=1): + return flashattn_bwd( + batch, + heads, + seq_len, + dim, + is_causal, + block_M, + block_N, + groups=groups, + threads=256, + num_stages=2, + ) + + +def ref_program(Q, K, V, is_causal, groups=1): + """CPU reference for forward only; backward ref omitted for brevity.""" + dim = Q.size(-1) + K_f = K.cpu().float().repeat_interleave(groups, dim=2) + V_f = V.cpu().float().repeat_interleave(groups, dim=2) + Q_f = Q.cpu().float() + scores = 
torch.einsum("bqhd,bkhd->bhqk", Q_f, K_f) + scores = scores / (dim**0.5) + if is_causal: + seq_len = Q_f.size(1) + mask = torch.tril(torch.ones(seq_len, seq_len)) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float("-inf")) + P = F.softmax(scores, dim=-1) + out_ref = torch.einsum("bhqk,bkhd->bqhd", P, V_f) + return out_ref.to(Q.dtype) + + +def main( + batch: int = 2, + heads: int = 4, + seq_len: int = 256, + dim: int = 128, + is_causal: bool = False, + groups: int = 1, + variant: str = "ss", +): + """Run GQA backward kernels (fwd + preprocess + bwd + postprocess).""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + block_M = 64 + block_N = 64 if dim <= 64 else 32 + bwd_fn = flashattn_bwd_warp if variant == "ts" else flashattn_bwd_pipeline + + kernel_fwd = flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=groups) + kernel_prep = flashattn_bwd_preprocess(batch, heads, seq_len, dim) + kernel_post = flashattn_bwd_postprocess(batch, heads, seq_len, dim) + kernel_bwd = bwd_fn(batch, heads, seq_len, dim, is_causal, block_M, block_N, groups=groups) + + Q = torch.randn(batch, seq_len, heads, dim, device="cuda", dtype=torch.bfloat16) + K = torch.randn(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.bfloat16) + V = torch.randn(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.bfloat16) + dO = torch.randn(batch, seq_len, heads, dim, device="cuda", dtype=torch.bfloat16) + + O, lse = kernel_fwd(Q, K, V) + Delta = kernel_prep(O, dO) + dQ = torch.zeros(batch, seq_len, heads, dim, device="cuda", dtype=torch.float32) + dK = torch.zeros(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.float32) + dV = torch.zeros(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.float32) + kernel_bwd(Q, K, V, dO, lse, Delta, dQ, dK, dV) + _ = kernel_post(dQ) # dQ_out in output layout; not compared to ref (no backward ref) + print("Blackwell GQA bwd ({}): run OK (backward gradients not verified against ref).".format(variant)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--heads", type=int, default=4) + parser.add_argument("--seq_len", type=int, default=256) + parser.add_argument("--dim", type=int, default=128) + parser.add_argument("--is_causal", action="store_true") + parser.add_argument("--groups", type=int, default=1, help="head_kv = heads // groups") + parser.add_argument( + "--variant", + choices=["ss", "ts"], + default="ss", + help="ss: pipeline (default); ts: 256 threads", + ) + args = parser.parse_args() + main( + args.batch, + args.heads, + args.seq_len, + args.dim, + args.is_causal, + args.groups, + args.variant, + ) diff --git a/examples/flash_attention_sm100/gqa_fwd_bshd.py b/examples/flash_attention_sm100/gqa_fwd_bshd.py new file mode 100644 index 0000000000..775cb45dd1 --- /dev/null +++ b/examples/flash_attention_sm100/gqa_fwd_bshd.py @@ -0,0 +1,504 @@ +"""Blackwell (SM100) GQA forward, BSHD layout. + +Q: [batch, seq_len, heads, dim], K/V: [batch, seq_len, head_kv, dim], head_kv = heads // groups. +variant='ss': mma_ss for both GEMMs (128 threads, P via shared memory). +variant='ts': mma_ts for GEMM 2 (256 threads, P via tensor memory). +variant='wasp': warp-specialized pipeline (softmax/DMA/BMM warps); GEMM 2 mma_ts. 
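+
+A minimal usage sketch (hypothetical sizes, not from the benchmarks below; assumes an
+SM100 GPU and bfloat16 inputs). With heads=4 and groups=2, head_kv = heads // groups = 2:
+
+    kernel = flashattn(batch=2, heads=4, seq_len=256, dim=128, is_causal=False,
+                       groups=2, variant="ss")
+    Q = torch.randn(2, 256, 4, 128, device="cuda", dtype=torch.bfloat16)
+    K = torch.randn(2, 256, 2, 128, device="cuda", dtype=torch.bfloat16)  # [b, s, head_kv, d]
+    V = torch.randn_like(K)
+    out = kernel(Q, K, V)  # [batch, seq_len, heads, dim], returned via out_idx=[3]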
+""" + +import torch +import torch.nn.functional as F +import tilelang +import tilelang.language as T +from tilelang.profiler import do_bench +import argparse + + +PASS_CFG = {tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: False} + + +@tilelang.jit(out_idx=[3], pass_configs=PASS_CFG) +def flashattn( + batch, + heads, + seq_len, + dim, + is_causal, + groups=1, + block_M=128, + block_N=128, + variant="ss", +): + """GQA forward. variant='ss': mma_ss (128t, P via shared); 'ts': mma_ts (256t, P via TMEM).""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + use_ts = variant == "ts" + threads = 256 if use_ts else 128 + scale = (1.0 / dim) ** 0.5 * 1.44269504 + q_shape = [batch, seq_len, heads, dim] + kv_shape = [batch, seq_len, head_kv, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + + S_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + D_tmem = T.alloc_tmem([block_M, dim], accum_dtype) + mbar_s = T.alloc_barrier(1) + mbar_d = T.alloc_barrier(1) + + if use_ts: + P_tmem = T.alloc_tmem([block_M, block_N], dtype) + else: + P_shared = T.alloc_shared([block_M, block_N], dtype) + + S_reg = T.alloc_fragment([block_M, block_N], accum_dtype) + P_cast = T.alloc_fragment([block_M, block_N], dtype) + O_reg = T.alloc_fragment([block_M, dim], accum_dtype) + D_reg = T.alloc_fragment([block_M, dim], accum_dtype) + + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + T.fill(O_reg, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + loop_range = ( + T.min( + T.ceildiv(seq_len, block_N), + T.ceildiv((bx + 1) * block_M, block_N), + ) + if is_causal + else T.ceildiv(seq_len, block_N) + ) + + for k in T.Pipelined(loop_range, num_stages=1): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared) + + T.tcgen05_gemm( + Q_shared, + K_shared, + S_tmem, + transpose_B=True, + mbar=mbar_s, + clear_accum=True, + ) + T.mbarrier_wait_parity(mbar_s, k % 2) + + T.copy(S_tmem, S_reg) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + bx * block_M + i >= k * block_N + j, + S_reg[i, j], + -T.infinity(accum_dtype), + ) + else: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + k * block_N + j >= seq_len, + -T.infinity(accum_dtype), + S_reg[i, j], + ) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(S_reg, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, 
block_N): + S_reg[i, j] = T.exp2(S_reg[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(S_reg, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] *= scores_scale[i] + + T.copy(S_reg, P_cast) + if use_ts: + T.copy(P_cast, P_tmem) + P_operand = P_tmem + else: + T.copy(P_cast, P_shared) + P_operand = P_shared + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared) + + T.tcgen05_gemm( + P_operand, + V_shared, + D_tmem, + mbar=mbar_d, + clear_accum=True, + ) + T.mbarrier_wait_parity(mbar_d, k % 2) + + T.copy(D_tmem, D_reg) + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] += D_reg[i, j] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] /= logsum[i] + T.copy(O_reg, O_shared) + T.copy( + O_shared, + Output[bz, bx * block_M : (bx + 1) * block_M, by, :], + ) + + return main + + +flashattn_ss = flashattn +flashattn_ts = flashattn + + +@tilelang.jit(out_idx=[3], pass_configs=PASS_CFG) +def flashattn_wasp( + batch, + heads, + seq_len, + dim, + is_causal, + groups=1, + block_M=128, + block_N=128, + threads=256, + num_stages=2, +): + """GQA warp-specialized pipeline: softmax(0-127)/DMA(128-159)/BMM(160-191); GEMM2 mma_ts.""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + scale = (1.0 / dim) ** 0.5 * 1.44269504 + q_shape = [batch, seq_len, heads, dim] + kv_shape = [batch, seq_len, head_kv, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(q_shape, dtype), + K: T.Tensor(kv_shape, dtype), + V: T.Tensor(kv_shape, dtype), + Output: T.Tensor(q_shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared_0 = T.alloc_shared([block_N, dim], dtype) + K_shared_1 = T.alloc_shared([block_N, dim], dtype) + V_shared_0 = T.alloc_shared([block_N, dim], dtype) + V_shared_1 = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + + S_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + P_tmem = T.alloc_tmem([block_M, block_N], dtype) + O_tmem = T.alloc_tmem([block_M, dim], accum_dtype) + + mbar_dma1_empty = T.alloc_barrier([32] * num_stages) + mbar_dma1_full = T.alloc_barrier([32] * num_stages) + mbar_bmm1_empty = T.alloc_barrier([128] * num_stages) + mbar_bmm1_full = T.alloc_barrier([1] * num_stages) + mbar_dma2_empty = T.alloc_barrier([32] * num_stages) + mbar_dma2_full = T.alloc_barrier([32] * num_stages) + mbar_bmm2_full = T.alloc_barrier([1] * num_stages) + mbar_softmax_empty = T.alloc_barrier([32] * num_stages) + mbar_softmax_full = T.alloc_barrier([128] * num_stages) + mbar_correction_full = T.alloc_barrier([32] * num_stages) + + tid = T.get_thread_binding() + + S_reg = T.alloc_fragment([block_M, block_N], accum_dtype) + P_cast = T.alloc_fragment([block_M, block_N], dtype) + O_reg = T.alloc_fragment([block_M, dim], accum_dtype) + + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_rescale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + if tid < 128: + T.fill(O_reg, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.copy(O_reg, O_tmem) + + loop_range = ( + T.min( + T.ceildiv(seq_len, 
block_N), + T.ceildiv((bx + 1) * block_M, block_N), + ) + if is_causal + else T.ceildiv(seq_len, block_N) + ) + + for k in T.serial(loop_range): + parity = (k // num_stages) & 1 + parity_inv = parity ^ 1 + stage_id = k % num_stages + is_clear_accum = k == 0 + + if tid >= 128 and tid < 160: + T.mbarrier_wait_parity(mbar_dma1_empty[stage_id], parity_inv) + + if k == 0: + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + + if stage_id == 0: + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared_0) + else: + T.copy(K[bz, k * block_N : (k + 1) * block_N, by // groups, :], K_shared_1) + T.mbarrier_arrive(mbar_dma1_full[stage_id]) + + T.mbarrier_wait_parity(mbar_dma2_empty[stage_id], parity_inv) + + if stage_id == 0: + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared_0) + else: + T.copy(V[bz, k * block_N : (k + 1) * block_N, by // groups, :], V_shared_1) + + T.mbarrier_arrive(mbar_dma2_full[stage_id]) + + elif tid >= 160 and tid < 192: + T.mbarrier_wait_parity(mbar_dma1_full[stage_id], parity) + T.mbarrier_wait_parity(mbar_bmm1_empty[stage_id], parity_inv) + + if stage_id == 0: + T.tcgen05_gemm( + Q_shared, + K_shared_0, + S_tmem, + transpose_B=True, + mbar=mbar_bmm1_full[stage_id], + clear_accum=True, + ) + else: + T.tcgen05_gemm( + Q_shared, + K_shared_1, + S_tmem, + transpose_B=True, + mbar=mbar_bmm1_full[stage_id], + clear_accum=True, + ) + T.mbarrier_arrive(mbar_dma1_empty[stage_id]) + + T.mbarrier_wait_parity(mbar_softmax_full[stage_id], parity) + T.mbarrier_wait_parity(mbar_dma2_full[stage_id], parity) + + if stage_id == 0: + T.tcgen05_gemm( + P_tmem, + V_shared_0, + O_tmem, + mbar=mbar_bmm2_full[stage_id], + clear_accum=is_clear_accum, + ) + else: + T.tcgen05_gemm( + P_tmem, + V_shared_1, + O_tmem, + mbar=mbar_bmm2_full[stage_id], + clear_accum=is_clear_accum, + ) + + T.mbarrier_arrive(mbar_softmax_empty[stage_id]) + T.mbarrier_arrive(mbar_dma2_empty[stage_id]) + + if k == loop_range - 1: + T.mbarrier_arrive(mbar_correction_full[0]) + + elif tid < 128: + T.mbarrier_wait_parity(mbar_softmax_empty[stage_id], parity_inv) + T.mbarrier_wait_parity(mbar_bmm1_full[stage_id], parity) + if k > 0: + prev_stage = (k - 1) % num_stages + prev_parity = ((k - 1) // num_stages) & 1 + T.mbarrier_wait_parity(mbar_bmm2_full[prev_stage], prev_parity) + + T.copy(O_tmem, O_reg) + T.copy(S_tmem, S_reg) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + bx * block_M + i >= k * block_N + j, + S_reg[i, j], + -T.infinity(accum_dtype), + ) + else: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + k * block_N + j >= seq_len, + -T.infinity(accum_dtype), + S_reg[i, j], + ) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(S_reg, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_rescale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.exp2(S_reg[i, j] * scale - scores_max[i] * scale) + + T.reduce_sum(S_reg, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_rescale[i] + scores_sum[i] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] *= scores_rescale[i] + + T.copy(S_reg, P_cast) + T.copy(P_cast, P_tmem) + T.copy(O_reg, O_tmem) + + T.mbarrier_arrive(mbar_softmax_full[stage_id]) + 
T.mbarrier_arrive(mbar_bmm1_empty[stage_id]) + + if k == loop_range - 1: + T.mbarrier_wait_parity(mbar_correction_full[0], 0) + T.mbarrier_wait_parity(mbar_bmm2_full[stage_id], parity) + T.copy(O_tmem, O_reg) + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] /= logsum[i] + T.copy(O_reg, O_shared) + T.copy( + O_shared, + Output[bz, bx * block_M : (bx + 1) * block_M, by, :], + ) + + return main + + +flashattn_warp = flashattn_wasp + + +def ref_program(Q, K, V, is_causal, groups=1): + """CPU reference: K/V [b,s,head_kv,d], expand to heads for einsum.""" + assert Q.size(2) == K.size(2) * groups + dim = Q.size(-1) + K_f = K.cpu().float().repeat_interleave(groups, dim=2) + V_f = V.cpu().float().repeat_interleave(groups, dim=2) + Q_f = Q.cpu().float() + scores = torch.einsum("bqhd,bkhd->bhqk", Q_f, K_f) + scores = scores / (dim**0.5) + if is_causal: + seq_len = Q_f.size(1) + mask = torch.tril(torch.ones(seq_len, seq_len)) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float("-inf")) + P = F.softmax(scores, dim=-1) + O = torch.einsum("bhqk,bkhd->bqhd", P, V_f) + return O.to(Q.dtype) + + +def main( + batch: int = 2, + heads: int = 4, + seq_len: int = 256, + dim: int = 128, + is_causal: bool = False, + groups: int = 1, + variant: str = "ss", +): + """Run GQA forward kernel (ss or ts variant) and benchmark.""" + if groups <= 0 or heads % groups != 0: + raise ValueError("groups must be a positive divisor of heads") + head_kv = heads // groups + flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim + total_flops = 2 * flops_per_matmul + if is_causal: + total_flops *= 0.5 + + print(f"=== Blackwell GQA Forward ({variant.upper()}) ===") + print(f"batch={batch}, heads={heads}, head_kv={head_kv}, groups={groups}, seq_len={seq_len}, dim={dim}, causal={is_causal}") + + if variant in ("ss", "ts"): + kernel = flashattn( + batch, + heads, + seq_len, + dim, + is_causal, + groups=groups, + block_M=128, + block_N=128, + variant=variant, + ) + else: + kernel = flashattn_wasp( + batch, + heads, + seq_len, + dim, + is_causal, + groups=groups, + block_M=128, + block_N=128, + threads=256, + num_stages=2, + ) + + Q = torch.randn(batch, seq_len, heads, dim, device="cuda", dtype=torch.bfloat16) + K = torch.randn(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.bfloat16) + V = torch.randn(batch, seq_len, head_kv, dim, device="cuda", dtype=torch.bfloat16) + + out = kernel(Q, K, V) + ref = ref_program(Q, K, V, is_causal, groups).to(out.device) + torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2) + print("Correctness check passed.") + + latency = do_bench(lambda: kernel(Q, K, V), warmup=100) + print(f"Blackwell GQA fwd ({variant}): {latency:.2f} ms") + print(f"Blackwell GQA fwd ({variant}): {total_flops / latency * 1e-9:.2f} TFlops") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--heads", type=int, default=4) + parser.add_argument("--seq_len", type=int, default=256) + parser.add_argument("--dim", type=int, default=128) + parser.add_argument("--is_causal", action="store_true") + parser.add_argument("--groups", type=int, default=1, help="GQA: head_kv = heads // groups") + parser.add_argument( + "--variant", + choices=["ss", "ts", "wasp"], + default="ss", + help="ss: pipeline (default); ts: 256 threads; wasp: warp-specialized", + ) + args = parser.parse_args() + main( + args.batch, + args.heads, + args.seq_len, + args.dim, + args.is_causal, + args.groups, + 
args.variant, + ) diff --git a/examples/flash_attention_sm100/mha_bwd_bshd.py b/examples/flash_attention_sm100/mha_bwd_bshd.py new file mode 100644 index 0000000000..45406a2eda --- /dev/null +++ b/examples/flash_attention_sm100/mha_bwd_bshd.py @@ -0,0 +1,309 @@ +"""Blackwell (SM100) MHA backward, BSHD layout. + +Pipeline (default): --variant ss or default. +ts (optional): --variant ts (256 threads, 2 stages). +""" + +import torch +import torch.nn.functional as F +import tilelang +import tilelang.language as T +import argparse + + +PASS_CFG = {tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: False} + + +@tilelang.jit(out_idx=[3, 4], pass_configs=PASS_CFG) +def flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N): + """Forward to get O and LSE (for backward).""" + scale = (1.0 / dim) ** 0.5 * 1.44269504 + shape = [batch, seq_len, heads, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=128) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + acc_s = T.alloc_fragment([block_M, block_N], accum_dtype) + acc_s_cast = T.alloc_fragment([block_M, block_N], dtype) + acc_o = T.alloc_fragment([block_M, dim], accum_dtype) + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + T.fill(acc_o, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + loop_range = ( + T.min(T.ceildiv(seq_len, block_N), T.ceildiv((bx + 1) * block_M, block_N)) if is_causal else T.ceildiv(seq_len, block_N) + ) + for k in T.Pipelined(loop_range, num_stages=1): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(bx * block_M + i >= k * block_N + j, 0, -T.infinity(acc_s.dtype)) + else: + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.if_then_else(k * block_N + j >= seq_len, -T.infinity(acc_s.dtype), 0) + T.tcgen05_gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + T.copy(scores_max, scores_max_prev) + T.reduce_max(acc_s, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] *= scores_scale[i] + for i, j in T.Parallel(block_M, block_N): + acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) + T.copy(acc_s, acc_s_cast) + T.tcgen05_gemm(acc_s_cast, V_shared, acc_o, policy=T.GemmWarpPolicy.FullRow) + T.reduce_sum(acc_s, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + for i, j in T.Parallel(block_M, dim): + acc_o[i, j] /= logsum[i] + T.copy(acc_o, 
Output[bz, bx * block_M : (bx + 1) * block_M, by, :]) + for i in T.Parallel(block_M): + logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale + T.copy(logsum, lse[bz, by, bx * block_M : (bx + 1) * block_M]) + + return main + + +@tilelang.jit(out_idx=[2], pass_configs=PASS_CFG) +def flashattn_bwd_preprocess(batch, heads, seq_len, dim): + dtype = T.bfloat16 + accum_dtype = T.float32 + shape = [batch, seq_len, heads, dim] + blk = 32 + + @T.prim_func + def main( + O: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + ): + with T.Kernel(heads, T.ceildiv(seq_len, blk), batch) as (bx, by, bz): + o = T.alloc_fragment([blk, blk], dtype) + do = T.alloc_fragment([blk, blk], dtype) + acc = T.alloc_fragment([blk, blk], accum_dtype) + delta = T.alloc_fragment([blk], accum_dtype) + T.clear(acc) + for k in range(T.ceildiv(dim, blk)): + T.copy(O[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], o) + T.copy(dO[bz, by * blk : (by + 1) * blk, bx, k * blk : (k + 1) * blk], do) + for i, j in T.Parallel(blk, blk): + acc[i, j] += o[i, j] * do[i, j] + T.reduce_sum(acc, delta, 1) + T.copy(delta, Delta[bz, bx, by * blk : (by + 1) * blk]) + + return main + + +def make_dq_layout(dQ): + return T.Layout( + dQ.shape, + lambda b, l, h, d: [b, l // 8, h, d // 8, (d % 2), 4 * (l % 8) + (d % 8) // 2], + ) + + +@tilelang.jit(out_idx=[1], pass_configs=PASS_CFG) +def flashattn_bwd_postprocess(batch, heads, seq_len, dim): + dtype = T.bfloat16 + accum_dtype = T.float32 + shape = [batch, seq_len, heads, dim] + blk = 64 + + @T.prim_func + def main( + dQ: T.Tensor(shape, accum_dtype), + dQ_out: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, blk), heads, batch, threads=128) as (bx, by, bz): + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy( + dQ[bz, bx * blk : (bx + 1) * blk, by, :], + dQ_out[bz, bx * blk : (bx + 1) * blk, by, :], + ) + + return main + + +@tilelang.jit(pass_configs=PASS_CFG) +def flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, threads=128, num_stages=2): + """Blackwell MHA backward. 
Pipeline default (128, 2); ts = (256, 2).""" + sm_scale = (1.0 / dim) ** 0.5 + scale = (1.0 / dim) ** 0.5 * 1.44269504 + shape = [batch, seq_len, heads, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + dO: T.Tensor(shape, dtype), + lse: T.Tensor([batch, heads, seq_len], accum_dtype), + Delta: T.Tensor([batch, heads, seq_len], accum_dtype), + dQ: T.Tensor(shape, accum_dtype), + dK: T.Tensor(shape, dtype), + dV: T.Tensor(shape, dtype), + ): + with T.Kernel(heads, T.ceildiv(seq_len, block_M), batch, threads=threads) as (bx, by, bz): + K_shared = T.alloc_shared([block_M, dim], dtype) + dsT_shared = T.alloc_shared([block_M, block_N], dtype) + q = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_M, dim], dtype) + qkT = T.alloc_fragment([block_M, block_N], accum_dtype) + dsT = T.alloc_fragment([block_M, block_N], accum_dtype) + qkT_cast = T.alloc_fragment([block_M, block_N], dtype) + dsT_cast = T.alloc_fragment([block_M, block_N], dtype) + lse_shared = T.alloc_shared([block_N], accum_dtype) + delta = T.alloc_shared([block_N], accum_dtype) + do = T.alloc_shared([block_N, dim], dtype) + dv = T.alloc_fragment([block_M, dim], accum_dtype) + dk = T.alloc_fragment([block_M, dim], accum_dtype) + dq = T.alloc_fragment([block_N, dim], accum_dtype) + dv_shared = T.alloc_shared([block_M, dim], dtype) + dk_shared = T.alloc_shared([block_M, dim], dtype) + + T.annotate_layout({dQ: make_dq_layout(dQ)}) + T.copy(K[bz, by * block_M : (by + 1) * block_M, bx, :], K_shared) + T.copy(V[bz, by * block_M : (by + 1) * block_M, bx, :], V_shared) + T.clear(dv) + T.clear(dk) + loop_st = T.floordiv(by * block_M, block_N) if is_causal else 0 + loop_ed = T.ceildiv(seq_len, block_N) + for k in T.Pipelined(loop_st, loop_ed, num_stages=num_stages): + T.copy(Q[bz, k * block_N : (k + 1) * block_N, bx, :], q) + T.clear(qkT) + T.tcgen05_gemm(K_shared, q, qkT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(lse[bz, bx, k * block_N : (k + 1) * block_N], lse_shared) + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.exp2(qkT[i, j] * scale - lse_shared[j]) + if is_causal: + for i, j in T.Parallel(block_M, block_N): + qkT[i, j] = T.if_then_else(by * block_M + i <= k * block_N + j, qkT[i, j], 0) + T.copy(dO[bz, k * block_N : (k + 1) * block_N, bx, :], do) + T.clear(dsT) + T.tcgen05_gemm(V_shared, do, dsT, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) + T.copy(qkT, qkT_cast) + T.tcgen05_gemm(qkT_cast, do, dv, policy=T.GemmWarpPolicy.FullRow) + + T.copy(Delta[bz, bx, k * block_N : (k + 1) * block_N], delta) + for i, j in T.Parallel(block_M, block_N): + dsT_cast[i, j] = qkT[i, j] * (dsT[i, j] - delta[j]) * sm_scale + T.tcgen05_gemm(dsT_cast, q, dk, policy=T.GemmWarpPolicy.FullRow) + + T.copy(dsT_cast, dsT_shared) + T.clear(dq) + T.tcgen05_gemm(dsT_shared, K_shared, dq, transpose_A=True) + for i, j in T.Parallel(block_N, dim): + T.atomic_add(dQ[bz, k * block_N + i, bx, j], dq[i, j]) + T.copy(dv, dv_shared) + T.copy(dk, dk_shared) + T.copy(dv_shared, dV[bz, by * block_M : (by + 1) * block_M, bx, :]) + T.copy(dk_shared, dK[bz, by * block_M : (by + 1) * block_M, bx, :]) + + return main + + +def flashattn_bwd_pipeline(batch, heads, seq_len, dim, is_causal, block_M, block_N): + """Pipeline (default): 128 threads, 2 stages.""" + return flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, threads=128, num_stages=2) + + +def flashattn_bwd_warp(batch, heads, 
seq_len, dim, is_causal, block_M, block_N): + """ts: 256 threads, 2 stages. Use --variant ts to enable.""" + return flashattn_bwd(batch, heads, seq_len, dim, is_causal, block_M, block_N, threads=256, num_stages=2) + + +def ref_program(Q, K, V, is_causal): + """CPU reference forward (for validation); backward ref not implemented.""" + dim = Q.size(-1) + Q_f = Q.cpu().float() + K_f = K.cpu().float() + V_f = V.cpu().float() + scores = torch.einsum("bqhd,bkhd->bhqk", Q_f, K_f) + scores = scores / (dim**0.5) + if is_causal: + seq_len = Q_f.size(1) + mask = torch.tril(torch.ones(seq_len, seq_len)) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float("-inf")) + P = F.softmax(scores, dim=-1) + out_ref = torch.einsum("bhqk,bkhd->bqhd", P, V_f) + return out_ref.to(Q.dtype) + + +def main( + batch: int = 2, + heads: int = 4, + seq_len: int = 256, + dim: int = 128, + is_causal: bool = False, + variant: str = "ss", +): + """Run MHA backward kernels (fwd + preprocess + bwd + postprocess).""" + block_M = 64 + block_N = 64 if dim <= 64 else 32 + use_ts = variant == "ts" + bwd_fn = flashattn_bwd_warp if use_ts else flashattn_bwd_pipeline + + kernel_fwd = flashattn_fwd(batch, heads, seq_len, dim, is_causal, block_M, block_N) + kernel_prep = flashattn_bwd_preprocess(batch, heads, seq_len, dim) + kernel_post = flashattn_bwd_postprocess(batch, heads, seq_len, dim) + kernel_bwd = bwd_fn(batch, heads, seq_len, dim, is_causal, block_M, block_N) + + Q = torch.randn(batch, seq_len, heads, dim, device="cuda", dtype=torch.bfloat16) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + dO = torch.randn_like(Q) + O, lse = kernel_fwd(Q, K, V) + Delta = kernel_prep(O, dO) + dQ = torch.zeros(batch, seq_len, heads, dim, device="cuda", dtype=torch.float32) + dK = torch.empty_like(K, device="cuda") + dV = torch.empty_like(V, device="cuda") + kernel_bwd(Q, K, V, dO, lse, Delta, dQ, dK, dV) + _ = kernel_post(dQ) # dQ_out in output layout; not compared to ref (no backward ref) + print("Blackwell MHA bwd ({}): run OK (backward gradients not verified against ref).".format(variant)) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=2) + parser.add_argument("--heads", type=int, default=4) + parser.add_argument("--seq_len", type=int, default=256) + parser.add_argument("--dim", type=int, default=128) + parser.add_argument("--is_causal", action="store_true") + parser.add_argument( + "--variant", + choices=["ss", "ts"], + default="ss", + help="ss: pipeline (default); ts: 256 threads", + ) + args = parser.parse_args() + main( + args.batch, + args.heads, + args.seq_len, + args.dim, + args.is_causal, + args.variant, + ) diff --git a/examples/flash_attention_sm100/mha_fwd_bshd.py new file mode 100644 index 0000000000..db9f2472fd --- /dev/null +++ b/examples/flash_attention_sm100/mha_fwd_bshd.py @@ -0,0 +1,491 @@ +"""Blackwell (SM100) Flash Attention Forward using TCGEN05MMA with TMEM accumulators. + +Replaces the Hopper WGMMA-based Flash Attention for Blackwell GPUs. +Three variants: ss, ts, wasp. + - flashattn (variant='ss'): Both GEMMs use mma_ss (shared x shared -> TMEM), 128 threads. + - flashattn (variant='ts'): Single-path; GEMM 2 uses mma_ts (P_tmem x V_shared -> D_tmem), 256 threads. + - flashattn_wasp: Warp-specialized pipeline (softmax/DMA/BMM warps); GEMM 2 mma_ts. + If wasp fails (e.g. layout inference), fall back to ts.
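+ All variants keep the online-softmax statistics (running row max and log-sum) in registers; the score and output accumulators live in TMEM and are filled by tcgen05 MMAs synchronized through mbarriers. + Example: python mha_fwd_bshd.py --batch 1 --heads 16 --seq_len 16384 --dim 128 --variant wasp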
+""" + +import torch +import torch.nn.functional as F +import tilelang +import tilelang.language as T +from tilelang.profiler import do_bench +import argparse + + +PASS_CFG = {tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: False} + + +@tilelang.jit(out_idx=[3], pass_configs=PASS_CFG) +def flashattn( + batch, + heads, + seq_len, + dim, + is_causal, + block_M=128, + block_N=128, + variant="ss", +): + """Flash Attention forward. variant='ss': mma_ss (128t, P via shared); 'ts': mma_ts (256t, P via TMEM).""" + use_ts = variant == "ts" + threads = 256 if use_ts else 128 + scale = (1.0 / dim) ** 0.5 * 1.44269504 + shape = [batch, seq_len, heads, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared = T.alloc_shared([block_N, dim], dtype) + V_shared = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + + S_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + D_tmem = T.alloc_tmem([block_M, dim], accum_dtype) + mbar_s = T.alloc_barrier(1) + mbar_d = T.alloc_barrier(1) + + if use_ts: + P_tmem = T.alloc_tmem([block_M, block_N], dtype) + else: + P_shared = T.alloc_shared([block_M, block_N], dtype) + + S_reg = T.alloc_fragment([block_M, block_N], accum_dtype) + P_cast = T.alloc_fragment([block_M, block_N], dtype) + O_reg = T.alloc_fragment([block_M, dim], accum_dtype) + D_reg = T.alloc_fragment([block_M, dim], accum_dtype) + + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_scale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + T.fill(O_reg, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + + loop_range = ( + T.min( + T.ceildiv(seq_len, block_N), + T.ceildiv((bx + 1) * block_M, block_N), + ) + if is_causal + else T.ceildiv(seq_len, block_N) + ) + + for k in T.Pipelined(loop_range, num_stages=1): + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared) + + # GEMM 1: S = Q @ K^T -> S_tmem (tcgen05mma_ss) + T.tcgen05_gemm( + Q_shared, + K_shared, + S_tmem, + transpose_B=True, + mbar=mbar_s, + clear_accum=True, + ) + T.mbarrier_wait_parity(mbar_s, k % 2) + + T.copy(S_tmem, S_reg) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + bx * block_M + i >= k * block_N + j, + S_reg[i, j], + -T.infinity(accum_dtype), + ) + else: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + k * block_N + j >= seq_len, + -T.infinity(accum_dtype), + S_reg[i, j], + ) + + # Online softmax + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(S_reg, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.exp2(S_reg[i, j] * scale - scores_max[i] * scale) + T.reduce_sum(S_reg, scores_sum, dim=1) + for i in 
T.Parallel(block_M): + logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] *= scores_scale[i] + + T.copy(S_reg, P_cast) + if use_ts: + T.copy(P_cast, P_tmem) + P_operand = P_tmem + else: + T.copy(P_cast, P_shared) + P_operand = P_shared + + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared) + + # GEMM 2: D = P @ V -> D_tmem (ss: mma_ss; ts: mma_ts) + T.tcgen05_gemm( + P_operand, + V_shared, + D_tmem, + mbar=mbar_d, + clear_accum=True, + ) + T.mbarrier_wait_parity(mbar_d, k % 2) + + T.copy(D_tmem, D_reg) + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] += D_reg[i, j] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] /= logsum[i] + T.copy(O_reg, O_shared) + T.copy( + O_shared, + Output[bz, bx * block_M : (bx + 1) * block_M, by, :], + ) + + return main + + +flashattn_ss = flashattn +flashattn_ts = flashattn + + +@tilelang.jit(out_idx=[3], pass_configs=PASS_CFG) +def flashattn_wasp( + batch, + heads, + seq_len, + dim, + is_causal, + block_M=128, + block_N=128, + threads=256, + num_stages=2, +): + """Warp-specialized pipeline: softmax(0-127)/DMA(128-159)/BMM(160-191); GEMM2 mma_ts.""" + scale = (1.0 / dim) ** 0.5 * 1.44269504 + shape = [batch, seq_len, heads, dim] + dtype = T.bfloat16 + accum_dtype = T.float32 + + @T.prim_func + def main( + Q: T.Tensor(shape, dtype), + K: T.Tensor(shape, dtype), + V: T.Tensor(shape, dtype), + Output: T.Tensor(shape, dtype), + ): + with T.Kernel(T.ceildiv(seq_len, block_M), heads, batch, threads=threads) as (bx, by, bz): + Q_shared = T.alloc_shared([block_M, dim], dtype) + K_shared_0 = T.alloc_shared([block_N, dim], dtype) + K_shared_1 = T.alloc_shared([block_N, dim], dtype) + V_shared_0 = T.alloc_shared([block_N, dim], dtype) + V_shared_1 = T.alloc_shared([block_N, dim], dtype) + O_shared = T.alloc_shared([block_M, dim], dtype) + + S_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + P_tmem = T.alloc_tmem([block_M, block_N], dtype) + O_tmem = T.alloc_tmem([block_M, dim], accum_dtype) + + mbar_dma1_empty = T.alloc_barrier([32] * num_stages) + mbar_dma1_full = T.alloc_barrier([32] * num_stages) + mbar_bmm1_empty = T.alloc_barrier([128] * num_stages) + mbar_bmm1_full = T.alloc_barrier([1] * num_stages) + mbar_dma2_empty = T.alloc_barrier([32] * num_stages) + mbar_dma2_full = T.alloc_barrier([32] * num_stages) + mbar_bmm2_full = T.alloc_barrier([1] * num_stages) + mbar_softmax_empty = T.alloc_barrier([32] * num_stages) + mbar_softmax_full = T.alloc_barrier([128] * num_stages) + mbar_correction_full = T.alloc_barrier([32] * num_stages) + + tid = T.get_thread_binding() + + S_reg = T.alloc_fragment([block_M, block_N], accum_dtype) + P_cast = T.alloc_fragment([block_M, block_N], dtype) + O_reg = T.alloc_fragment([block_M, dim], accum_dtype) + + scores_max = T.alloc_fragment([block_M], accum_dtype) + scores_max_prev = T.alloc_fragment([block_M], accum_dtype) + scores_rescale = T.alloc_fragment([block_M], accum_dtype) + scores_sum = T.alloc_fragment([block_M], accum_dtype) + logsum = T.alloc_fragment([block_M], accum_dtype) + + if tid < 128: + T.fill(O_reg, 0) + T.fill(logsum, 0) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.copy(O_reg, O_tmem) + + loop_range = ( + T.min( + T.ceildiv(seq_len, block_N), + T.ceildiv((bx + 1) * block_M, block_N), + ) + if is_causal + else T.ceildiv(seq_len, block_N) + ) + + for k in T.serial(loop_range): + parity = (k // num_stages) & 1 + parity_inv = parity ^ 1 + stage_id = k % num_stages + is_clear_accum = k == 0 + + if tid >= 128 and tid < 160: + 
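# DMA warps (tid 128-159): wait until the consumer has released this stage's K buffer before overwriting it. +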
T.mbarrier_wait_parity(mbar_dma1_empty[stage_id], parity_inv) + + if k == 0: + T.copy(Q[bz, bx * block_M : (bx + 1) * block_M, by, :], Q_shared) + + if stage_id == 0: + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared_0) + else: + T.copy(K[bz, k * block_N : (k + 1) * block_N, by, :], K_shared_1) + T.mbarrier_arrive(mbar_dma1_full[stage_id]) + + T.mbarrier_wait_parity(mbar_dma2_empty[stage_id], parity_inv) + + if stage_id == 0: + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared_0) + else: + T.copy(V[bz, k * block_N : (k + 1) * block_N, by, :], V_shared_1) + + T.mbarrier_arrive(mbar_dma2_full[stage_id]) + + elif tid >= 160 and tid < 192: + T.mbarrier_wait_parity(mbar_dma1_full[stage_id], parity) + T.mbarrier_wait_parity(mbar_bmm1_empty[stage_id], parity_inv) + + if stage_id == 0: + T.tcgen05_gemm( + Q_shared, + K_shared_0, + S_tmem, + transpose_B=True, + mbar=mbar_bmm1_full[stage_id], + clear_accum=True, + ) + else: + T.tcgen05_gemm( + Q_shared, + K_shared_1, + S_tmem, + transpose_B=True, + mbar=mbar_bmm1_full[stage_id], + clear_accum=True, + ) + T.mbarrier_arrive(mbar_dma1_empty[stage_id]) + + T.mbarrier_wait_parity(mbar_softmax_full[stage_id], parity) + T.mbarrier_wait_parity(mbar_dma2_full[stage_id], parity) + + if stage_id == 0: + T.tcgen05_gemm( + P_tmem, + V_shared_0, + O_tmem, + mbar=mbar_bmm2_full[stage_id], + clear_accum=is_clear_accum, + ) + else: + T.tcgen05_gemm( + P_tmem, + V_shared_1, + O_tmem, + mbar=mbar_bmm2_full[stage_id], + clear_accum=is_clear_accum, + ) + + T.mbarrier_arrive(mbar_softmax_empty[stage_id]) + T.mbarrier_arrive(mbar_dma2_empty[stage_id]) + + if k == loop_range - 1: + T.mbarrier_arrive(mbar_correction_full[0]) + + elif tid < 128: + T.mbarrier_wait_parity(mbar_softmax_empty[stage_id], parity_inv) + T.mbarrier_wait_parity(mbar_bmm1_full[stage_id], parity) + if k > 0: + prev_stage = (k - 1) % num_stages + prev_parity = ((k - 1) // num_stages) & 1 + T.mbarrier_wait_parity(mbar_bmm2_full[prev_stage], prev_parity) + + T.copy(O_tmem, O_reg) + T.copy(S_tmem, S_reg) + + if is_causal: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + bx * block_M + i >= k * block_N + j, + S_reg[i, j], + -T.infinity(accum_dtype), + ) + else: + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.if_then_else( + k * block_N + j >= seq_len, + -T.infinity(accum_dtype), + S_reg[i, j], + ) + + T.copy(scores_max, scores_max_prev) + T.fill(scores_max, -T.infinity(accum_dtype)) + T.reduce_max(S_reg, scores_max, dim=1, clear=False) + for i in T.Parallel(block_M): + scores_max[i] = T.max(scores_max[i], scores_max_prev[i]) + for i in T.Parallel(block_M): + scores_rescale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) + for i, j in T.Parallel(block_M, block_N): + S_reg[i, j] = T.exp2(S_reg[i, j] * scale - scores_max[i] * scale) + + T.reduce_sum(S_reg, scores_sum, dim=1) + for i in T.Parallel(block_M): + logsum[i] = logsum[i] * scores_rescale[i] + scores_sum[i] + + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] *= scores_rescale[i] + + T.copy(S_reg, P_cast) + T.copy(P_cast, P_tmem) + T.copy(O_reg, O_tmem) + + T.mbarrier_arrive(mbar_softmax_full[stage_id]) + T.mbarrier_arrive(mbar_bmm1_empty[stage_id]) + + if k == loop_range - 1: + T.mbarrier_wait_parity(mbar_correction_full[0], 0) + T.mbarrier_wait_parity(mbar_bmm2_full[stage_id], parity) + T.copy(O_tmem, O_reg) + for i, j in T.Parallel(block_M, dim): + O_reg[i, j] /= logsum[i] + T.copy(O_reg, O_shared) + T.copy( + O_shared, + Output[bz, bx * block_M : (bx + 1) * 
block_M, by, :], + ) + + return main + + +flashattn_warp = flashattn_wasp + + +def ref_program(Q, K, V, is_causal): + """CPU reference computation to avoid cuBLAS issues on Blackwell.""" + Q_f = Q.cpu().float() + K_f = K.cpu().float() + V_f = V.cpu().float() + dim = Q_f.size(-1) + scores = torch.einsum("bqhd,bkhd->bhqk", Q_f, K_f) + scores = scores / (dim**0.5) + if is_causal: + seq_len = Q_f.size(1) + mask = torch.tril(torch.ones(seq_len, seq_len)) + mask = mask.unsqueeze(0).unsqueeze(0) + scores = scores.masked_fill(mask == 0, float("-inf")) + attention_weights = F.softmax(scores, dim=-1) + output = torch.einsum("bhqk,bkhd->bqhd", attention_weights, V_f) + return output.to(torch.bfloat16) + + +def main( + batch: int = 2, + heads: int = 4, + seq_len: int = 256, + dim: int = 128, + is_causal: bool = False, + variant: str = "ss", +): + """Run MHA forward kernel (ss / ts / wasp) and benchmark.""" + flops_per_matmul = 2.0 * batch * heads * seq_len * seq_len * dim + total_flops = 2 * flops_per_matmul + if is_causal: + total_flops *= 0.5 + + print(f"=== Blackwell Flash Attention ({variant.upper()}) ===") + print(f"batch={batch}, heads={heads}, seq_len={seq_len}, dim={dim}, causal={is_causal}") + + if variant in ("ss", "ts"): + kernel = flashattn( + batch, + heads, + seq_len, + dim, + is_causal, + block_M=128, + block_N=128, + variant=variant, + ) + else: + kernel = flashattn_wasp( + batch, + heads, + seq_len, + dim, + is_causal, + block_M=128, + block_N=128, + threads=256, + num_stages=2, + ) + + Q = torch.randn(batch, seq_len, heads, dim, device="cuda", dtype=torch.bfloat16) + K = torch.randn_like(Q) + V = torch.randn_like(Q) + + out = kernel(Q, K, V) + ref = ref_program(Q, K, V, is_causal).to(out.device) + torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2) + print("Correctness check passed.") + + latency = do_bench(lambda: kernel(Q, K, V), warmup=100) + print(f"Blackwell ({variant}): {latency:.2f} ms") + print(f"Blackwell ({variant}): {total_flops / latency * 1e-9:.2f} TFlops") + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch", type=int, default=1) + parser.add_argument("--heads", type=int, default=16) + parser.add_argument("--seq_len", type=int, default=16384) + parser.add_argument("--dim", type=int, default=128) + parser.add_argument("--is_causal", action="store_true") + parser.add_argument( + "--variant", + choices=["ss", "ts", "wasp"], + default="wasp", + help="ss: pipeline, 128 threads; ts: single-path, 256 threads (mma_ts); wasp: warp-specialized (use ts if wasp fails)", + ) + args = parser.parse_args() + print(args) + main( + args.batch, + args.heads, + args.seq_len, + args.dim, + args.is_causal, + args.variant, + ) diff --git a/examples/flash_decoding/example_gqa_decode.py index 9e6f360178..26b4801115 100644 --- a/examples/flash_decoding/example_gqa_decode.py +++ b/examples/flash_decoding/example_gqa_decode.py @@ -40,13 +40,13 @@ def get_heuristic_config() -> Tuple[Dict, int]: return cfg, sm_version -# TODO(lei): fix warp specialized and tma lower pass +# TODO(lei): fix warp specialized pass def get_pass_configs(): - return {tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} + return {tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} @autotune(configs=get_configs(), warmup=10, rep=10) -@tilelang.jit(out_idx=[6], pass_configs=get_pass_configs()) +@tilelang.jit(out_idx=[4], pass_configs=get_pass_configs()) def flashattn(batch,
heads, groups, seqlen_kv, dim, block_N, block_H, num_split, num_stages, threads): scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, heads, dim] @@ -67,10 +67,10 @@ def flashattn_gqa_decode_split( K: T.Tensor(shape_k, dtype), V: T.Tensor(shape_v, dtype), mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), Output: T.Tensor(shape_o, dtype), ): + glse = T.alloc_global([batch, heads, num_split], dtype) + Output_partial = T.alloc_global(part_shape, dtype) # split with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) @@ -193,8 +193,6 @@ def flashattn_gqa_decode_no_split( K: T.Tensor(shape_k, dtype), V: T.Tensor(shape_v, dtype), mask: T.Tensor([batch, seqlen_kv, groups], "uint8"), - glse: T.Tensor([batch, heads, num_split], dtype), - Output_partial: T.Tensor(part_shape, dtype), Output: T.Tensor(shape_o, dtype), ): with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): @@ -259,7 +257,7 @@ def flashattn_gqa_decode_no_split( return flashattn_gqa_decode_no_split -def ref_program(query, key, value, mask, glse, Output_partial): +def ref_program(query, key, value, mask): # """ # Inputs: # - query (Tensor): [batch, heads, dim] @@ -417,12 +415,9 @@ def main(batch: int = 1, heads: int = 32, groups: int = 8, kv_seqlen: int = 8192 k = torch.randn(batch, kv_seqlen, groups, dim, device="cuda", dtype=torch.float16) v = torch.randn(batch, kv_seqlen, groups, dim, device="cuda", dtype=torch.float16) mask = torch.randint(0, 2, (batch, kv_seqlen, groups), device="cuda", dtype=torch.uint8) - split = config["num_split"] - glse = torch.empty(batch, heads, split, device="cuda", dtype=torch.float16) - Output_partial = torch.empty(batch, heads, split, dim, device="cuda", dtype=torch.float16) - o = kernel(q, k, v, mask, glse, Output_partial) - o_ref = ref_program(q, k, v, mask, glse, Output_partial) - o_ref_split = ref_split_program(q, k, v, mask, glse, Output_partial) + o = kernel(q, k, v, mask) + o_ref = ref_program(q, k, v, mask) + o_ref_split = ref_split_program(q, k, v, mask) print(o) print(o_ref) diff --git a/examples/flash_decoding/example_gqa_decode_varlen_logits.py b/examples/flash_decoding/example_gqa_decode_varlen_logits.py index 30acd879e6..468be22302 100644 --- a/examples/flash_decoding/example_gqa_decode_varlen_logits.py +++ b/examples/flash_decoding/example_gqa_decode_varlen_logits.py @@ -1,10 +1,9 @@ import torch -import triton -import triton.language as tl import math import argparse import tilelang import tilelang.language as T +from tilelang.profiler import do_bench torch.manual_seed(0) @@ -21,167 +20,6 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor: return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim) -@triton.jit -def _fwd_inner( - q, - k_ptrs, - v_ptrs, - s_ptrs, - m_i, - l_i, - acc, - offs_h, - mask_h, - offs_n, - seqlen, - softmax_scale, - lo, - hi, - stride_kt, - stride_vt, - stride_sh, - stride_sn, - BLOCK_N: tl.constexpr, -): - """Inner loop computation for attention""" - - for blk_idx in tl.range(lo, hi): - start_n = blk_idx * BLOCK_N - k = tl.load(k_ptrs + start_n * stride_kt, mask=offs_n[None, :] + start_n < seqlen) - v = tl.load(v_ptrs + start_n * stride_vt, mask=offs_n[:, None] + start_n < seqlen) - - qk = tl.dot(q, k) - qk *= softmax_scale - qk += tl.where(offs_n[None, :] + start_n < seqlen, 0, -1.0e9) - 
- row_max = tl.max(qk, 1) - tl.store(s_ptrs + offs_h * stride_sh + blk_idx * stride_sn, row_max, mask=mask_h) - - m_ij = tl.maximum(m_i, row_max) - qk -= m_ij[:, None] - p = tl.math.exp(qk) - l_ij = tl.sum(p, 1) - alpha = tl.math.exp(m_i - m_ij) - l_i = l_i * alpha + l_ij - m_i = m_ij - acc *= alpha[:, None] - p = p.to(v.type.element_ty) - acc += tl.dot(p, v) - - return m_i, l_i, acc - - -@triton.autotune( - configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [4, 8] for num_stages in [2, 4]], - key=["gqa_group_size", "BLOCK_N", "BLOCK_D", "BLOCK_H"], -) -@triton.jit -def _fwd_kernel_varlen( - Q, # [token_q = b, h_q, dim] - K, # [token_k, h_kv, dim] - V, - O, - S, - s_aux, - softmax_scale, - cu_seqlens_k, - stride_qt, - stride_qh, - stride_qd, - stride_kt, - stride_kh, - stride_kd, - stride_vt, - stride_vh, - stride_vd, - stride_ot, - stride_oh, - stride_od, - stride_sb, - stride_sh, - stride_sn, # bmask shape [b, q_h, seq/BLOCK_N] - gqa_group_size: tl.constexpr, - BLOCK_H: tl.constexpr, - BLOCK_N: tl.constexpr, - BLOCK_D: tl.constexpr, -): - off_z = tl.program_id(0) - off_h_for_kv = tl.program_id(1) - off_h_q = off_h_for_kv * gqa_group_size - - cu_k_start = tl.load(cu_seqlens_k + off_z) - cu_k_end = tl.load(cu_seqlens_k + off_z + 1) - - seqlen_k = cu_k_end - cu_k_start - - offs_h = tl.arange(0, BLOCK_H) - offs_n = tl.arange(0, BLOCK_N) - offs_d = tl.arange(0, BLOCK_D) - - Q_ptrs = Q + off_z * stride_qt + off_h_q * stride_qh - K_ptrs = K + (cu_k_start) * stride_kt + off_h_for_kv * stride_kh - V_ptrs = V + (cu_k_start) * stride_vt + off_h_for_kv * stride_vh - O_ptrs = O + off_z * stride_ot + off_h_q * stride_oh - S_ptrs = S + off_z * stride_sb + off_h_q * stride_sh - - mask_h = offs_h < gqa_group_size - q = tl.load(Q_ptrs + offs_d[None, :] * stride_qd + offs_h[:, None] * stride_qh, mask=mask_h[:, None]) - - if s_aux is not None: - sink = tl.load(s_aux + off_h_q + offs_h, mask=mask_h).to(tl.float32) - l_i = tl.zeros([BLOCK_H], dtype=tl.float32) - m_i = tl.zeros([BLOCK_H], dtype=tl.float32) + sink - else: - l_i = tl.full([BLOCK_H], 1.0, dtype=tl.float32) - m_i = tl.full([BLOCK_H], float("-inf"), dtype=tl.float32) - - acc = tl.zeros([BLOCK_H, BLOCK_D], dtype=tl.float32) - - k_ptrs = K_ptrs + offs_n[None, :] * stride_kt + offs_d[:, None] * stride_kd - v_ptrs = V_ptrs + offs_n[:, None] * stride_vt + offs_d[None, :] * stride_vd - - lo, hi = 0, tl.cdiv(seqlen_k, BLOCK_N) - m_i, l_i, acc = _fwd_inner( - q, - k_ptrs, - v_ptrs, - S_ptrs, - m_i, - l_i, - acc, - offs_h, - mask_h, - offs_n, - seqlen_k, - softmax_scale, - lo, - hi, - stride_kt, - stride_vt, - stride_sh, - stride_sn, - BLOCK_N, - ) - - if s_aux is not None: - sink = tl.math.exp(sink - m_i) - l_i = l_i + sink - acc = acc / l_i[:, None] - - else: - l_recip = 1 / l_i[:, None] - acc = acc * l_recip - - for blk_idx in tl.range(lo, hi): - s = tl.load(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, mask=mask_h) - s = tl.exp(s - m_i) / l_i - tl.store(S_ptrs + offs_h * stride_sh + blk_idx * stride_sn, s, mask=mask_h) - - acc = acc.to(O.dtype.element_ty) - - tl.store(O_ptrs + offs_h[:, None] * stride_oh + offs_d[None, :] * stride_od, acc, mask=mask_h[:, None]) - - def get_configs(): import itertools @@ -211,7 +49,6 @@ def flashattn( kv_group_num = heads // k_heads valid_block_H = min(block_H, kv_group_num) - # TODO: check if max_seqlen_kv is correct for varlen case @T.prim_func def flashattn_gqa_decode_no_split( @@ -223,7 +60,7 @@ def flashattn_gqa_decode_no_split( Output: T.Tensor(shape_o, dtype), S: 
T.Tensor(shape_s, dtype), ): - with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bx, by, bz): + with T.Kernel(batch, heads // valid_block_H, num_split, threads=threads) as (bid, hid, bz): Q_shared = T.alloc_shared([block_H, dim], dtype) K_shared = T.alloc_shared([block_N, dim], dtype) V_shared = T.alloc_shared([block_N, dim], dtype) @@ -236,12 +73,10 @@ def flashattn_gqa_decode_no_split( scores_scale = T.alloc_fragment([block_H], accum_dtype) scores_sum = T.alloc_fragment([block_H], accum_dtype) logsum = T.alloc_fragment([block_H], accum_dtype) - S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) - # S_fragment = T.alloc_fragment([block_H, math.ceil(max_seqlen_kv / block_N)], accum_dtype) + S_shared = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], accum_dtype) + S_shared_cast = T.alloc_shared([block_H, math.ceil(max_seqlen_kv / block_N)], dtype) s_aux_shared = T.alloc_shared([block_H], T.float32) - bid = bx - hid = by cur_kv_head = hid // (kv_group_num // valid_block_H) cur_start_k = cu_seqlens_k[bid] @@ -253,30 +88,22 @@ def flashattn_gqa_decode_no_split( T.fill(logsum, 0) T.fill(scores_max, -T.infinity(accum_dtype)) - # loop_range = T.ceildiv((seqlen_kv // num_split), block_N) loop_range = T.ceildiv((cur_seqlen_k // num_split), block_N) for k in T.Pipelined(loop_range, num_stages=num_stages): T.copy(K[cur_start_k + k * block_N : cur_start_k + (k + 1) * block_N, cur_kv_head, :], K_shared) T.clear(acc_s) T.gemm(Q_shared, K_shared, acc_s, transpose_B=True, policy=T.GemmWarpPolicy.FullRow) for i, j in T.Parallel(block_H, block_N): - # acc_s[i, j] = T.if_then_else(mask_local[j] != 0 and k * block_N + j < cur_seqlen_k, acc_s[i, j], - # -T.infinity(accum_dtype)) acc_s[i, j] = T.if_then_else(k * block_N + j < cur_seqlen_k, acc_s[i, j], -T.infinity(accum_dtype)) T.copy(scores_max, scores_max_prev) T.fill(scores_max, -T.infinity(accum_dtype)) T.reduce_max(acc_s, scores_max, dim=1, clear=False) - # scores_max_prev is m_i - # scores_max is row_max->m_ij in triton T.copy(scores_max, S_shared[:, k]) - # scores_scale is alpha in triton for i in T.Parallel(block_H): scores_scale[i] = T.exp2(scores_max_prev[i] * scale - scores_max[i] * scale) for i, j in T.Parallel(block_H, block_N): acc_s[i, j] = T.exp2(acc_s[i, j] * scale - scores_max[i] * scale) T.reduce_sum(acc_s, scores_sum, dim=1) - # scores_sum is l_ij in triton - # logsum is l_i in triton for i in T.Parallel(block_H): logsum[i] = logsum[i] * scores_scale[i] + scores_sum[i] T.copy(acc_s, acc_s_cast) @@ -293,355 +120,109 @@ def flashattn_gqa_decode_no_split( acc_o[i, j] /= logsum[i] for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): S_shared[h, k] = T.exp2((S_shared[h, k] - scores_max[h]) * scale) / logsum[h] - # T.copy(S_shared, S_fragment) - # for h, k in T.Parallel(block_H, math.ceil(max_seqlen_kv / block_N)): - # S_fragment[h, k] = T.exp2((S_fragment[h, k] - scores_max[h]) * scale) / logsum[h] for i in T.Parallel(block_H): logsum[i] = T.log2(logsum[i]) + scores_max[i] * scale T.copy(acc_o[:valid_block_H, :], O_shared) T.copy(O_shared, Output[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) - # T.copy(S_fragment, S_shared) - T.copy(S_shared[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) + T.copy(S_shared, S_shared_cast) + T.copy(S_shared_cast[:valid_block_H, :], S[bid, hid * valid_block_H : (hid + 1) * valid_block_H, :]) - # TODO: split version return flashattn_gqa_decode_no_split -def 
flash_attn_with_attn_pool_decode_tilelang( - Q: torch.Tensor, ## [tq = b, q_h, q_dim] - K: torch.Tensor, ## [tk, k_h, k_dim] - V: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlen_k: int, - real_max_k_seqlen: int, - num_split: int, - softmax_scale: float, - s_aux: torch.Tensor = None, - block_size: int = 64, - use_per_kv_head_sparse_index: bool = False, - tl_kernel=None, -): - num_tokens, q_h, head_size = Q.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = K.size(1) - - assert Q.dim() == K.dim() == 3 - assert Q.size(2) == K.size(2) - assert cu_seqlens_k.dim() == 1 - assert head_size in {64, 128, 256} - assert Q.is_contiguous() - # assert K.is_contiguous() - # assert V.is_contiguous() +def ref_attention(q, k, v, k_seqlens, q_heads, sink=None): + """ + Compute reference attention output and weights. + Args: + q: [b, q_heads, head_size] + k, v: [b, kv_heads, max_seqlen, head_size] + k_seqlens: [b] actual sequence lengths + sink: [q_heads] optional sink values + Returns: output [b, q_heads, head_size], attn_weights [b, q_heads, max_seqlen] + """ + batch_size, kv_heads, max_seqlen, head_size = k.shape + softmax_scale = 1.0 / math.sqrt(head_size) - gqa_group_size = q_h // k_h + # Expand KV heads and compute attention scores + k = repeat_kv(k, q_heads // kv_heads) + v = repeat_kv(v, q_heads // kv_heads) + logits = torch.matmul(q.unsqueeze(2), k.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] - O_tl = torch.zeros_like(Q) - S_tl = torch.zeros((batch, q_h, math.ceil(real_max_k_seqlen / block_size)), dtype=Q.dtype, device=Q.device) - O_tl, S_tl = tl_kernel(Q, K, V, cu_seqlens_k, s_aux) + # Mask invalid positions + mask = torch.arange(max_seqlen, device=q.device).expand(batch_size, -1) >= k_seqlens.unsqueeze(1) + logits.masked_fill_(mask.unsqueeze(1).unsqueeze(2), float("-inf")) - if use_per_kv_head_sparse_index: - S_tl = torch.max_pool2d(S_tl, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) - else: - S_tl = torch.max_pool2d(S_tl, kernel_size=(q_h, 1), stride=(q_h, 1)) - - return O_tl, S_tl - - -def flash_attn_with_attn_pool_decode( - Q: torch.Tensor, ## [tq = b, q_h, q_dim] - K: torch.Tensor, ## [tk, k_h, k_dim] - V: torch.Tensor, - cu_seqlens_k: torch.Tensor, - max_seqlen_k: int, - real_max_k_seqlen: int, - num_split: int, - softmax_scale: float, - s_aux: torch.Tensor = None, - block_size: int = 64, - use_per_kv_head_sparse_index: bool = False, -): - num_tokens, q_h, head_size = Q.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = K.size(1) - - assert Q.dim() == K.dim() == 3 - assert Q.size(2) == K.size(2) - assert cu_seqlens_k.dim() == 1 - assert head_size in {64, 128, 256} - assert Q.is_contiguous() - # assert K.is_contiguous() - # assert V.is_contiguous() - - gqa_group_size = q_h // k_h - - BLOCK_D = head_size - BLOCK_N = block_size - BLOCK_H = 64 - - O = torch.zeros_like(Q) - S = torch.zeros((batch, q_h, math.ceil(max_seqlen_k / block_size)), dtype=Q.dtype, device=Q.device) - - def grid(META): - return (batch, k_h) - - with torch.cuda.device(Q.device.index): - _fwd_kernel_varlen[grid]( - Q, - K, - V, - O, - S, - s_aux, - softmax_scale, - cu_seqlens_k, - *Q.stride(), - *K.stride(), - *V.stride(), - *O.stride(), - *S.stride(), - gqa_group_size, - BLOCK_H=BLOCK_H, - BLOCK_N=BLOCK_N, - BLOCK_D=BLOCK_D, - ) - - if use_per_kv_head_sparse_index: - S = torch.max_pool2d(S, kernel_size=(gqa_group_size, 1), stride=(gqa_group_size, 1)) + if sink is None: + attn_weights = logits.softmax(dim=-1) else: - S = torch.max_pool2d(S, kernel_size=(q_h, 1), stride=(q_h, 1)) + # Sink 
attention: softmax with additional sink term + sink_expanded = sink.view(1, q_heads, 1, 1) + logits_max = torch.maximum(logits.max(dim=-1, keepdim=True).values, sink_expanded) + exp_logits = torch.exp(logits - logits_max) + attn_weights = exp_logits / (exp_logits.sum(dim=-1, keepdim=True) + torch.exp(sink_expanded - logits_max)) - return O, S + attn_weights.masked_fill_(mask.unsqueeze(1).unsqueeze(2), 0.0) + output = torch.matmul(attn_weights.to(v.dtype), v).squeeze(2) + return output, attn_weights.squeeze(2) def test_varlen_decode_main(args): - """Test decode kernel with variable sequence lengths""" - batch_size = args.batch_size - q_heads = args.q_heads - kv_heads = args.kv_heads - max_k_seqlen = args.k_seqlen # Use as max sequence length - real_max_k_seqlen = args.k_seqlen - head_size = args.head_size - block_size = args.block_size + """Test decode kernel with variable sequence lengths.""" + batch_size, q_heads, kv_heads = args.batch_size, args.q_heads, args.kv_heads + max_k_seqlen, head_size, block_size = args.k_seqlen, args.head_size, args.block_size dtype = torch.bfloat16 if args.dtype == T.bfloat16 else torch.float16 - print(f"Testing decode kernel with variable sequence lengths (max_k_seqlen={max_k_seqlen})") - - # Generate sink values if needed - sink = None - if args.test_sink: - sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values - print(f"Using sink attention with sink values: {sink}") - - # Generate variable length k sequences - k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) - print(f"k_seqlens: {k_seqlens}") - - # Generate cumulative sequence lengths for k - cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) - total_k_tokens = 0 - for i in range(batch_size): - cu_seqlens_k[i] = total_k_tokens - total_k_tokens += k_seqlens[i] - cu_seqlens_k[batch_size] = total_k_tokens - - print(f"cu_seqlens_k: {cu_seqlens_k}") - - # Generate tensors - Q is [batch_size, q_heads, head_size] for decode - q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) - k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) - v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) - - softmax_scale = 1.0 / math.sqrt(head_size) - max_seqlen_k = int(k_seqlens.max()) - - print(f"Actual max_seqlen_k: {max_seqlen_k}") - print(f"q_decode shape: {q_decode.shape}") - print(f"k_varlen shape: {k_varlen.shape}") - print(f"v_varlen shape: {v_varlen.shape}") - - num_tokens, q_h, head_size = q_decode.shape - batch = cu_seqlens_k.size(0) - 1 - k_h = k_varlen.size(1) - tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) + # Make the test deterministic and independent of global RNG state. + # This avoids flaky allclose failures when run under xdist with different + # test ordering. 
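+ # torch.random.fork_rng snapshots the CPU (and listed CUDA) RNG states on entry + # and restores them on exit, so the seeding below cannot leak into later tests.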
+ cuda_devices = list(range(torch.cuda.device_count())) + with torch.random.fork_rng(devices=cuda_devices): + torch.manual_seed(0) + if cuda_devices: + torch.cuda.manual_seed_all(0) - # Test our decode kernel - O_triton, S_triton = flash_attn_with_attn_pool_decode( - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size, - ) - O_tilelang, S_tilelang = flash_attn_with_attn_pool_decode_tilelang( - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - real_max_k_seqlen, - args.num_split, - softmax_scale, - s_aux=sink, - block_size=block_size, - tl_kernel=tl_kernel, - ) + # Generate variable length sequences and cumulative lengths + k_seqlens = torch.randint(max_k_seqlen // 4, max_k_seqlen + 1, size=(batch_size,)) + cu_seqlens_k = torch.zeros(batch_size + 1, device="cuda", dtype=torch.int32) + cu_seqlens_k[1:] = torch.cumsum(k_seqlens, dim=0).to(torch.int32).cuda() + total_k_tokens = cu_seqlens_k[-1].item() + + # Generate input tensors + q = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) + k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 if args.test_sink else None + + # Run tilelang kernel + tl_kernel = flashattn(batch_size, q_heads, kv_heads, max_k_seqlen, total_k_tokens, head_size, args.test_sink) + O_tl, S_tl = tl_kernel(q, k_varlen, v_varlen, cu_seqlens_k, sink) + S_tl = torch.max_pool2d(S_tl, kernel_size=(q_heads, 1), stride=(q_heads, 1)) + + # Mask out invalid S positions for i in range(batch_size): - S_tilelang[i, :, math.ceil((cu_seqlens_k[i + 1].item() - cu_seqlens_k[i].item()) / block_size) :] = 0 - - # Create torch reference - pad tensors for comparison - k_padded_list = [] - v_padded_list = [] + valid_blocks = math.ceil(k_seqlens[i].item() / block_size) + S_tl[i, :, valid_blocks:] = 0 + # Prepare padded tensors for reference + actual_max = int(k_seqlens.max()) + k_padded = torch.zeros(batch_size, kv_heads, actual_max, head_size, device="cuda", dtype=dtype) + v_padded = torch.zeros(batch_size, kv_heads, actual_max, head_size, device="cuda", dtype=dtype) for i in range(batch_size): - actual_k_len = k_seqlens[i] - - # Extract and pad k, v for this batch - k_start = cu_seqlens_k[i] - k_end = cu_seqlens_k[i + 1] - - # Pad to max_seqlen_k - k_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) - v_padded = torch.zeros(max_seqlen_k, kv_heads, head_size, device="cuda", dtype=dtype) - - k_padded[:actual_k_len] = k_varlen[k_start:k_end] - v_padded[:actual_k_len] = v_varlen[k_start:k_end] - - k_padded_list.append(k_padded) - v_padded_list.append(v_padded) - - # Stack to create batched tensors [b, max_seqlen, kv_heads, head_size] - k_padded_batched = torch.stack(k_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] - v_padded_batched = torch.stack(v_padded_list, dim=0).transpose(1, 2) # [b, kv_heads, max_seqlen, head_size] - - # Expand q to match kv heads: [b, q_heads, 1, head_size] - q_expanded = q_decode.unsqueeze(2) # [b, q_heads, 1, head_size] - - print(f"q_expanded shape: {q_expanded.shape}") - print(f"k_padded_batched shape: {k_padded_batched.shape}") - print(f"v_padded_batched shape: {v_padded_batched.shape}") - - # Compute torch reference - k_repeat = repeat_kv(k_padded_batched, q_heads // 
kv_heads) # [b, q_heads, max_seqlen, head_size] - v_repeat = repeat_kv(v_padded_batched, q_heads // kv_heads) # [b, q_heads, max_seqlen, head_size] - - if sink is None: - # Standard attention computation: [b, q_heads, 1, head_size] @ [b, q_heads, head_size, max_seqlen] - attn_score = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] + seq_len = k_seqlens[i].item() + k_padded[i, :, :seq_len] = k_varlen[cu_seqlens_k[i] : cu_seqlens_k[i + 1]].transpose(0, 1) + v_padded[i, :, :seq_len] = v_varlen[cu_seqlens_k[i] : cu_seqlens_k[i + 1]].transpose(0, 1) - # Apply sequence length masking - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_score[i, :, :, actual_k_len:] = float("-inf") - - attn_weights = attn_score.softmax(dim=-1) # [b, q_heads, 1, max_seqlen] - - # Mask out invalid positions - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_weights[i, :, :, actual_k_len:] = 0.0 - - # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] - O_torch = torch.matmul(attn_weights, v_repeat) # [b, q_heads, 1, head_size] - else: - # s_aux attention - logits = torch.matmul(q_expanded, k_repeat.transpose(-2, -1)) * softmax_scale # [b, q_heads, 1, max_seqlen] - - # Apply sequence length masking - for i in range(batch_size): - actual_k_len = k_seqlens[i] - logits[i, :, :, actual_k_len:] = float("-inf") - - sink_expanded = sink.view(1, q_heads, 1, 1) # [1, q_heads, 1, 1] - logits_max = torch.max(logits, dim=-1, keepdim=True).values - logits_or_sinks_max = torch.maximum(logits_max, sink_expanded) - sinks = torch.exp(sink_expanded - logits_or_sinks_max) - unnormalized_scores = torch.exp(logits - logits_or_sinks_max) - normalizer = unnormalized_scores.sum(dim=-1, keepdim=True) + sinks - attn_weights = unnormalized_scores / normalizer - - # Mask out invalid positions - for i in range(batch_size): - actual_k_len = k_seqlens[i] - attn_weights[i, :, :, actual_k_len:] = 0.0 - - # Compute output: [b, q_heads, 1, max_seqlen] @ [b, q_heads, max_seqlen, head_size] - O_torch = torch.matmul(attn_weights.to(v_repeat.dtype), v_repeat) # [b, q_heads, 1, head_size] - - O_torch = O_torch.squeeze(2) # [b, q_heads, head_size] - - # Compute attention score pooling for S - attn_score_pooled = torch.max_pool2d( - attn_weights.squeeze(2), # [b, q_heads, max_seqlen] - kernel_size=(q_heads, block_size), - stride=(q_heads, block_size), - ceil_mode=True, - ).to(dtype=torch.float16) # [b, 1, ceil(max_seqlen/block_size)] - - print(f"O_triton shape: {O_triton.shape}") - print(f"O_tilelang shape: {O_tilelang.shape}") - print(f"O_torch shape: {O_torch.shape}") - print(f"S_triton shape: {S_triton.shape}") - print(f"S_tilelang shape: {S_tilelang.shape}") - print(f"attn_score_pooled shape: {attn_score_pooled.shape}") + # Compute reference + O_ref, attn_weights = ref_attention(q, k_padded, v_padded, k_seqlens.cuda(), q_heads, sink) + S_ref = torch.max_pool2d(attn_weights, kernel_size=(q_heads, block_size), stride=(q_heads, block_size), ceil_mode=True).to(dtype) # Compare results - max_diff_o = torch.max(torch.abs(O_triton - O_torch)) - max_diff_o_tl = torch.max(torch.abs(O_tilelang - O_torch)) - print(f"Max difference in O: {max_diff_o.item()}") - print(f"Max difference in O_tilelang: {max_diff_o_tl.item()}") - - max_diff_s = torch.max(torch.abs(S_triton - attn_score_pooled)) - max_diff_s_tl = torch.max( - torch.abs( - S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)] - attn_score_pooled[:, :, : math.ceil(max_seqlen_k / block_size)] - 
) - ) - print(f"Max difference in S: {max_diff_s.item()}") - print(f"Max difference in S_tilelang: {max_diff_s_tl.item()}") - - assert torch.allclose(O_triton, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o.item()}" - assert torch.allclose(S_triton, attn_score_pooled, atol=1e-2, rtol=1e-2), f"Score mismatch: {max_diff_s.item()}" - assert torch.allclose(O_tilelang, O_torch, atol=1e-2, rtol=1e-2), f"Output mismatch: {max_diff_o_tl.item()}" - assert torch.allclose( - S_tilelang[:, :, : math.ceil(max_seqlen_k / block_size)], - attn_score_pooled[:, :, : math.ceil(max_seqlen_k / block_size)], - atol=1e-2, - rtol=1e-2, - ), f"Score mismatch: {max_diff_s_tl.item()}" - + num_blocks = math.ceil(actual_max / block_size) + assert torch.allclose(O_tl, O_ref, atol=1e-2, rtol=1e-2), f"Output mismatch: {(O_tl - O_ref).abs().max()}" + assert torch.allclose(S_tl[:, :, :num_blocks], S_ref[:, :, :num_blocks], atol=1e-2, rtol=1e-2), "Score mismatch" print("✅ All tests passed!") -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. - """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def speed_benchmark_decode_comparison(args): """Speed benchmark for decode kernel""" batch_size = args.batch_size @@ -682,66 +263,25 @@ def speed_benchmark_decode_comparison(args): q_decode = torch.randn(batch_size, q_heads, head_size, device="cuda", dtype=dtype) k_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) v_varlen = torch.randn(total_k_tokens, kv_heads, head_size, device="cuda", dtype=dtype) - - softmax_scale = 1.0 / math.sqrt(head_size) - max_seqlen_k = int(k_seqlens.max()) - - # Generate sink values if needed - sink = None - if args.test_sink: - sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 # Small sink values - print(" Using sink attention with sink values") - - print("Setup complete:") - print(f" Total K tokens: {total_k_tokens}") - print(f" Actual max K seq len: {max_seqlen_k}") + sink = torch.randn(q_heads, device="cuda", dtype=torch.float32) * 0.1 if args.test_sink else None if args.test_varlen: print(f" K sequence lengths: {k_seqlens.tolist()}") - # Warmup - num_tokens, q_h, head_size = q_decode.shape + _, q_h, head_size = q_decode.shape batch = cu_seqlens_k.size(0) - 1 k_h = k_varlen.size(1) tl_kernel = flashattn(batch, q_h, k_h, args.k_seqlen, cu_seqlens_k[-1].item(), head_size, args.test_sink) + def run_once(): + tl_kernel(q_decode, k_varlen, v_varlen, cu_seqlens_k, sink) + # Benchmark print("⚡ Benchmarking Tilelang kernel (100 iterations)...") tilelang_time = do_bench( - flash_attn_with_attn_pool_decode_tilelang, - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - max_seqlen_k, - args.k_seqlen, - 1, - softmax_scale, - sink, - block_size, - False, - tl_kernel, + run_once, ) print(f"Average decode kernel time Tilelang: {tilelang_time:.3f} ms") - # Benchmark - print("⚡ Benchmarking Triton kernel (100 iterations)...") - triton_time = do_bench( - flash_attn_with_attn_pool_decode, - q_decode, - k_varlen, - v_varlen, - cu_seqlens_k, - 
max_seqlen_k, - args.k_seqlen, - 1, - softmax_scale, - sink, - block_size, - ) - print(f"Average decode kernel time Triton: {triton_time:.3f} ms") - - print(f"Speedup: {(triton_time / tilelang_time):.3f}") - def main(): args = argparse.Namespace( @@ -779,7 +319,4 @@ def main(): args.dtype = T.float16 args.num_split = 1 - if args.benchmark: - speed_benchmark_decode_comparison(args) - else: - test_varlen_decode_main(args) + speed_benchmark_decode_comparison(args) diff --git a/examples/flash_decoding/example_mha_inference.py index 24a90c57b5..f17d6abc75 100644 --- a/examples/flash_decoding/example_mha_inference.py +++ b/examples/flash_decoding/example_mha_inference.py @@ -8,7 +8,7 @@ num_split = 4 -@tilelang.jit(out_idx=[5]) +@tilelang.jit(out_idx=[3]) def flashattn(batch, heads, seqlen_q, seqlen_kv, dim, is_causal, block_M, block_N): scale = (1.0 / dim) ** 0.5 * 1.44269504 # log2(e) shape_q = [batch, seqlen_q, heads, dim] @@ -22,10 +22,10 @@ def flashattn_mha_inference( Q: T.Tensor(shape_q, dtype), K: T.Tensor(shape_kv, dtype), V: T.Tensor(shape_kv, dtype), - glse: T.Tensor([batch, heads, num_split, seqlen_q], dtype), - Output_partial: T.Tensor(part_shape, dtype), # [batch, seqlen_q, heads, num_split, dim] Output: T.Tensor(shape_q, dtype), ): + glse = T.alloc_global([batch, heads, num_split, seqlen_q], dtype) + Output_partial = T.alloc_global(part_shape, dtype) # [batch, seqlen_q, heads, num_split, dim] # split with T.Kernel(T.ceildiv(seqlen_q, block_M), heads * batch, num_split, threads=128) as (bx, by, bz): Q_shared = T.alloc_shared([block_M, dim], dtype) @@ -155,7 +155,7 @@ def flashattn_mha_inference( return flashattn_mha_inference -def ref_program(Q, K, V, glse, Output_partial, causal): +def ref_program(Q, K, V, causal): assert causal is False dim = Q.size(-1) scores = torch.einsum("bqhd,bkhd->bhqk", Q, K) diff --git a/examples/flash_decoding/test_example_flash_decoding.py index a02a920974..3181df2d56 100644 --- a/examples/flash_decoding/test_example_flash_decoding.py +++ b/examples/flash_decoding/test_example_flash_decoding.py @@ -1,18 +1,22 @@ +import os +import pytest import tilelang.testing import example_gqa_decode import example_mha_inference import example_gqa_decode_varlen_logits -import example_gqa_decode_varlen_logits_paged + +_is_cutedsl = os.environ.get("TILELANG_TARGET", "").lower() == "cutedsl" -# TODO(lei): fix the correctness of gqa decode on sm90 @tilelang.testing.requires_cuda @tilelang.testing.requires_cuda_compute_version_le(8, 9) +@pytest.mark.skipif(_is_cutedsl, reason="CuTeDSL backend does not support alloc_global yet") def test_example_example_gqa_decode(): example_gqa_decode.main() +@pytest.mark.skipif(_is_cutedsl, reason="CuTeDSL backend does not support alloc_global yet") def test_example_example_mha_inference(): example_mha_inference.main(BATCH=1, H=32, Q_CTX=128, KV_CTX=2048, D_HEAD=128, causal=False) @@ -21,9 +25,5 @@ def test_example_example_gqa_decode_varlen_logits(): example_gqa_decode_varlen_logits.main() -def test_example_example_gqa_decode_varlen_logits_paged(): - example_gqa_decode_varlen_logits_paged.main() - - if __name__ == "__main__": tilelang.testing.main() diff --git a/examples/fusedmoe/example_fusedmoe_tilelang.py index 5c236dd802..d4a2ced46d 100644 ---
a/examples/fusedmoe/example_fusedmoe_tilelang.py +++ b/examples/fusedmoe/example_fusedmoe_tilelang.py @@ -8,7 +8,7 @@ from example_fusedmoe_torch import * -@tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +@tilelang.jit(pass_configs={"tl.disable_warp_specialized": True}) def moe_forward_tilelang_shared( d_hidden, d_expert, @@ -93,7 +93,7 @@ def kernel_shared( return kernel_shared -@tilelang.jit(pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True}) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def moe_forward_tilelang_routed( d_hidden, d_expert, @@ -106,8 +106,6 @@ def moe_forward_tilelang_routed( block_dexpert=128, threads=256, num_stages=1, - k_pack=1, - coalesced_width=None, ): scale = 1.44269504 # log2(e) @@ -155,7 +153,7 @@ def kernel( gate_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) up_logits_local = T.alloc_fragment((block_token, block_dexpert), dtype=accum_dtype) - T.use_swizzle(10, enable=True) + T.use_swizzle(10) m_start_padded = bx * block_token @@ -172,24 +170,21 @@ def kernel( T.copy( input[m_start : m_start + block_token, k * block_dhidden : (k + 1) * block_dhidden], input_shared, - coalesced_width=coalesced_width, ) T.copy( routed_expert_gate[ cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden ], routed_expert_gate_shared, - coalesced_width=coalesced_width, ) - T.gemm(input_shared, routed_expert_gate_shared, gate_logits_local, k_pack=k_pack, transpose_B=True) + T.gemm(input_shared, routed_expert_gate_shared, gate_logits_local, transpose_B=True) T.copy( routed_expert_up[ cur_group_idx, by * block_dexpert : (by + 1) * block_dexpert, k * block_dhidden : (k + 1) * block_dhidden ], routed_expert_up_shared, - coalesced_width=coalesced_width, ) - T.gemm(input_shared, routed_expert_up_shared, up_logits_local, k_pack=k_pack, transpose_B=True) + T.gemm(input_shared, routed_expert_up_shared, up_logits_local, transpose_B=True) for i, j in T.Parallel(block_token, block_dexpert): gate_logits_local[i, j] = gate_logits_local[i, j] * (1.0 / (1.0 + T.exp2(-gate_logits_local[i, j] * scale))) @@ -205,7 +200,7 @@ def kernel( routed_expert_down_shared = T.alloc_shared((block_dhidden, block_dexpert), dtype=dtype) output_local = T.alloc_fragment((block_token, block_dhidden), dtype=accum_dtype) - T.use_swizzle(10, enable=True) + T.use_swizzle(10) m_start_padded = bx * block_token @@ -221,16 +216,14 @@ def kernel( T.copy( up_logits[m_start : m_start + block_token, k * block_dexpert : (k + 1) * block_dexpert], up_logits_shared, - coalesced_width=coalesced_width, ) T.copy( routed_expert_down[ cur_group_idx, by * block_dhidden : (by + 1) * block_dhidden, k * block_dexpert : (k + 1) * block_dexpert ], routed_expert_down_shared, - coalesced_width=coalesced_width, ) - T.gemm(up_logits_shared, routed_expert_down_shared, output_local, k_pack=k_pack, transpose_B=True) + T.gemm(up_logits_shared, routed_expert_down_shared, output_local, transpose_B=True) for i, j in T.Parallel(block_token, block_dhidden): if i < actual_rows: @@ -479,8 +472,6 @@ def custom_kernel(data: Tuple[torch.Tensor, Dict, Dict]) -> torch.Tensor: block_dexpert=128, threads=256, num_stages=1, - k_pack=1, - coalesced_width=2, ) moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) @@ -503,13 +494,8 @@ def main(d_hidden=7168, d_expert=2048, n_routed_experts=8, n_shared_experts=1, n } data = generate_input(**config) - - 
torch.cuda.synchronize() ref_output = ref_kernel(clone_data(data)).to(torch.float32) - torch.cuda.synchronize() tilelang_output = custom_kernel(clone_data(data)).to(torch.float32) - torch.cuda.synchronize() - torch.testing.assert_close(ref_output, tilelang_output, atol=1e-2, rtol=1e-2) print("✅ Tilelang and Torch match") @@ -554,8 +540,6 @@ def run_regression_perf( block_dexpert=128, threads=256, num_stages=1, - k_pack=1, - coalesced_width=2, ) moe = MoE(config, shared_kernel, routed_kernel, weights, padding_M=128) @@ -627,8 +611,11 @@ def run_routed_kernel_only(): moe.expert_output_routed, ) - return do_bench(run_routed_kernel_only, backend="cupti") + shared_latency = do_bench(run_shared_kernel_only, backend="cupti") + routed_latency = do_bench(run_routed_kernel_only, backend="cupti") + return (shared_latency + routed_latency) / 2 if __name__ == "__main__": + tilelang.disable_cache() main() diff --git a/examples/gdn/example_chunk_delta_bwd.py b/examples/gdn/example_chunk_delta_bwd.py index 4230df525e..466c471821 100644 --- a/examples/gdn/example_chunk_delta_bwd.py +++ b/examples/gdn/example_chunk_delta_bwd.py @@ -4,6 +4,7 @@ import tilelang import tilelang.language as T +from tilelang.profiler import do_bench print(tilelang.__file__, flush=True) @@ -544,31 +545,6 @@ def run_test( assert_similar(dv2_ref_torch, dv2_tilelang, 1e-5, "torch-tilelang", data="dv2") -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. - """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def main(): DK = 128 run_test( diff --git a/examples/gdn/example_chunk_delta_h.py b/examples/gdn/example_chunk_delta_h.py index 2ee84e7bf6..c34d9b5304 100644 --- a/examples/gdn/example_chunk_delta_h.py +++ b/examples/gdn/example_chunk_delta_h.py @@ -4,6 +4,7 @@ import tilelang import tilelang.language as T from tilelang.autotuner import autotune +from tilelang.profiler import do_bench # Add your fla repository path to sys.path # Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae @@ -224,31 +225,6 @@ def kernel( return kernel -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. 
- """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def run_test( B, S, diff --git a/examples/gdn/example_chunk_o.py b/examples/gdn/example_chunk_o.py index a4d7281f55..bb95f555f8 100644 --- a/examples/gdn/example_chunk_o.py +++ b/examples/gdn/example_chunk_o.py @@ -127,16 +127,15 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(G_diff_local[i_s1, i_s2] <= 0): - with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) - with T.Else(): - A_fragment[i_s1, i_s2] = 0 + A_fragment[i_s1, i_s2] = T.if_then_else( + G_diff_local[i_s1, i_s2] <= 0, + A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]), + 0, + ) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 < i_s2): # noqa: SIM117 - with T.Then(): - A_fragment[i_s1, i_s2] = 0 + if i_s1 < i_s2: + A_fragment[i_s1, i_s2] = 0 T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], V_shared) T.copy(A_fragment, A_shared) diff --git a/examples/gdn/example_chunk_o_bwd.py b/examples/gdn/example_chunk_o_bwd.py index e589818f4c..b369e03a8f 100644 --- a/examples/gdn/example_chunk_o_bwd.py +++ b/examples/gdn/example_chunk_o_bwd.py @@ -109,7 +109,7 @@ def prepare_output( @tilelang.jit( out_idx=[-4, -3, -2, -1], - pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def tilelang_chunk_o_bwd_dqkwg( # task config @@ -359,31 +359,6 @@ def kernel( return kernel -def do_bench(fn, *args, warmup=10, rep=10, **kwargs): - """ - Do benchmark for a function. 
- """ - start_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - end_event = [torch.cuda.Event(enable_timing=True) for i in range(rep)] - for _ in range(warmup): - fn(*args, **kwargs) - - torch.cuda.synchronize() - for i in range(rep): - start_event[i].record() - fn(*args, **kwargs) - end_event[i].record() - torch.cuda.synchronize() - - # Record clocks - times = torch.tensor( - [s.elapsed_time(e) for s, e in zip(start_event, end_event)], - dtype=torch.float, - ) - - return times.mean().item() - - def run_test( B, S, diff --git a/examples/gdn/example_chunk_scaled_dot_kkt.py b/examples/gdn/example_chunk_scaled_dot_kkt.py index 8c7a4d573b..c16374fe8c 100644 --- a/examples/gdn/example_chunk_scaled_dot_kkt.py +++ b/examples/gdn/example_chunk_scaled_dot_kkt.py @@ -111,16 +111,15 @@ def kernel( for i_s1, i_s2 in T.Parallel(block_S, block_S): G_diff_local[i_s1, i_s2] = G_shared[i_s1] - G_shared[i_s2] for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2): - with T.Then(): - A_fragment[i_s1, i_s2] = A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]) - with T.Else(): - A_fragment[i_s1, i_s2] = 0 + A_fragment[i_s1, i_s2] = T.if_then_else( + G_diff_local[i_s1, i_s2] <= 0 and i_s1 > i_s2, + A_fragment[i_s1, i_s2] * T.exp(G_diff_local[i_s1, i_s2]), + 0, + ) else: for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 <= i_s2): # noqa: SIM117 - with T.Then(): - A_fragment[i_s1, i_s2] = 0 + if i_s1 <= i_s2: + A_fragment[i_s1, i_s2] = 0 T.copy(A_fragment, A_shared) T.copy(A_shared, A[bb, bs * block_S : (bs + 1) * block_S, bh, :]) diff --git a/examples/gdn/example_cumsum.py b/examples/gdn/example_cumsum.py index 0760b49645..9d4ca0222e 100644 --- a/examples/gdn/example_cumsum.py +++ b/examples/gdn/example_cumsum.py @@ -20,9 +20,7 @@ import torch -@tilelang.jit( - out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True} -) +@tilelang.jit(out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_chunk_local_cumsum_scalar( # task config B, diff --git a/examples/gdn/example_wy_fast_bwd_split.py b/examples/gdn/example_wy_fast_bwd_split.py index de8afc2b77..5711010025 100644 --- a/examples/gdn/example_wy_fast_bwd_split.py +++ b/examples/gdn/example_wy_fast_bwd_split.py @@ -94,7 +94,7 @@ def prepare_output( @tilelang.jit( out_idx=[-5, -4, -3, -2, -1], - pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) def tilelang_wy_fast_bwd( # task config @@ -247,7 +247,7 @@ def kernel( return kernel -@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}) def tilelang_wy_fast_bwd_split( # task config B, @@ -345,26 +345,25 @@ def kernel( T.copy(dA_shared, dA_fragment) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(i_s1 <= i_s2): # noqa: SIM117 - with T.Then(): - dA_fragment[i_s1, i_s2] = 0 + if i_s1 <= i_s2: + dA_fragment[i_s1, i_s2] = 0 T.copy(dA_fragment, dA_shared) T.gemm(dA_shared, A_shared, dA_fragment, clear_accum=True, transpose_B=True) T.copy(dA_fragment, dA_shared) T.gemm(A_shared, dA_shared, dA_fragment, clear_accum=True, transpose_A=True) for i_s1, i_s2 in T.Parallel(block_S, block_S): 
- with T.If(i_s1 <= i_s2): - with T.Then(): - dA_fragment[i_s1, i_s2] = 0 - with T.Else(): - dA_fragment[i_s1, i_s2] = -dA_fragment[i_s1, i_s2] + dA_fragment[i_s1, i_s2] = T.if_then_else( + i_s1 <= i_s2, + 0, + -dA_fragment[i_s1, i_s2], + ) for i_s1, i_s2 in T.Parallel(block_S, block_S): - with T.If(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0): - with T.Then(): - dA_fragment[i_s1, i_s2] *= T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]) - with T.Else(): - dA_fragment[i_s1, i_s2] = 0 + dA_fragment[i_s1, i_s2] = T.if_then_else( + G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh] <= 0, + dA_fragment[i_s1, i_s2] * T.exp(G[bb, bs * block_S + i_s1, bh] - G[bb, bs * block_S + i_s2, bh]), + 0, + ) T.copy(dA_fragment, dA_shared) # acceptable dA diff diff --git a/examples/gdn/test_example_gdn_compilation.py b/examples/gdn/test_example_gdn_compilation.py index 6f9fa5d2f7..b3d255a70a 100644 --- a/examples/gdn/test_example_gdn_compilation.py +++ b/examples/gdn/test_example_gdn_compilation.py @@ -50,6 +50,7 @@ def test_example_wy_fast_compilation(): ) print(kernel.get_kernel_source()) W_tilelang, U_tilelang = kernel(K, V, Beta, G, A) + torch.cuda.synchronize() def test_example_wy_fast_bwd_split_compilation(): @@ -317,4 +318,4 @@ def test_example_chunk_delta_bwd_compilation(): if __name__ == "__main__": # tilelang.testing.main() - test_example_chunk_delta_bwd_compilation() + test_example_wy_fast_compilation() diff --git a/examples/gemm/example_gemm_autotune.py b/examples/gemm/example_gemm_autotune.py index 016d448a4c..052bd64c6d 100644 --- a/examples/gemm/example_gemm_autotune.py +++ b/examples/gemm/example_gemm_autotune.py @@ -107,7 +107,13 @@ def get_configs(M, N, K, with_roller=False, topk=20): return configs -def get_best_config(M, N, K, with_roller=False): +def get_best_config( + M, + N, + K, + with_roller: bool = False, + profile_backend: str = "event", +): def kernel( block_M=None, block_N=None, @@ -156,6 +162,7 @@ def main( supply_type=tl.TensorSupplyType.Integer, ref_prog=ref_program, skip_check=False, + backend=profile_backend, ) ) return autotuner.run(warmup=3, rep=20) @@ -207,10 +214,22 @@ def gemm_autotune( return gemm_autotune -def main(M: int = 4096, N: int = 4096, K: int = 4096, use_autotune: bool = False, with_roller: bool = False): - use_autotune = True +def main( + M: int = 4096, + N: int = 4096, + K: int = 4096, + use_autotune: bool = False, + with_roller: bool = False, + profile_backend: str = "event", +): if use_autotune: - result = get_best_config(M, N, K, with_roller) + result = get_best_config( + M, + N, + K, + with_roller=with_roller, + profile_backend=profile_backend, + ) print(result.config) kernel = result.kernel else: @@ -219,8 +238,13 @@ def main(M: int = 4096, N: int = 4096, K: int = 4096, use_autotune: bool = False # benchmark profiler = kernel.get_profiler(tensor_supply_type=tl.TensorSupplyType.Auto) - tilelang_latency = profiler.do_bench() - ref_latency = profiler.do_bench(ref_program) + tilelang_latency = profiler.do_bench( + backend=profile_backend, + ) + ref_latency = profiler.do_bench( + ref_program, + backend=profile_backend, + ) profiler.assert_allclose(ref_program, atol=1e-2, rtol=1e-2) print(f"TileLang latency: {tilelang_latency}") print(f"Ref latency: {ref_latency}") @@ -242,5 +266,13 @@ def run_regression_perf(M: int = 4096, N: int = 4096, K: int = 4096): parser.add_argument("--k", type=int, default=4096, help="Matrix dimension K") parser.add_argument("--use_autotune", action="store_true", 
default=False, help="Whether to use autotune for matmul configs") parser.add_argument("--with_roller", action="store_true", default=False, help="Whether to enable BitBLAS roller for search space") + parser.add_argument("--profile_backend", type=str, default="event", help="Profiler backend") args = parser.parse_args() - main(args.m, args.n, args.k, args.use_autotune, args.with_roller) + main( + args.m, + args.n, + args.k, + args.use_autotune, + args.with_roller, + args.profile_backend, + ) diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py index d4bc9480ff..15e552587e 100644 --- a/examples/gemm/example_gemm_intrinsics.py +++ b/examples/gemm/example_gemm_intrinsics.py @@ -6,7 +6,6 @@ from tilelang.intrinsics.mma_macro_generator import ( TensorCoreIntrinEmitter, ) -from tilelang.transform import simplify_prim_func def make_swizzle_layout(shared_buf): @@ -25,7 +24,6 @@ def transform_func(i, j): @tilelang.jit(out_idx=[2]) -@simplify_prim_func def tl_matmul( M, N, diff --git a/examples/gemm/regression_example_gemm.py b/examples/gemm/regression_example_gemm.py index 3583cf16ac..4976020598 100644 --- a/examples/gemm/regression_example_gemm.py +++ b/examples/gemm/regression_example_gemm.py @@ -2,7 +2,6 @@ import example_gemm import example_gemm_autotune import example_gemm_intrinsics -import example_gemm_schedule def regression_example_gemm_autotune(): @@ -13,10 +12,6 @@ def regression_example_gemm_intrinsics(): tilelang.testing.process_func(example_gemm_intrinsics.run_regression_perf, M=1024, N=1024, K=1024) -def regression_example_gemm_schedule(): - tilelang.testing.process_func(example_gemm_schedule.run_regression_perf) - - def regression_example_gemm(): tilelang.testing.process_func(example_gemm.run_regression_perf) diff --git a/examples/gemm/test_example_gemm.py b/examples/gemm/test_example_gemm.py index 5f69364be6..fb0ae3ab4b 100644 --- a/examples/gemm/test_example_gemm.py +++ b/examples/gemm/test_example_gemm.py @@ -1,7 +1,6 @@ import tilelang.testing import example_gemm_autotune import example_gemm_intrinsics -import example_gemm_schedule import example_gemm @@ -14,10 +13,6 @@ def test_example_gemm_intrinsics(): example_gemm_intrinsics.main(M=1024, N=1024, K=1024) -def test_example_gemm_schedule(): - example_gemm_schedule.main() - - def test_example_gemm(): example_gemm.main() diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd.py b/examples/gemm_fp8/example_tilelang_gemm_amd.py index 93f8c4980c..16a9d5f329 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_amd.py +++ b/examples/gemm_fp8/example_tilelang_gemm_amd.py @@ -2,6 +2,7 @@ import tilelang import tilelang.language as T from tilelang.utils.tensor import torch_assert_close +from tilelang.utils import determine_fp8_type, determine_torch_fp8_type import itertools @@ -17,8 +18,9 @@ def supply_prog(args): a_param, b_param = args M, K = a_param.shape N, _ = b_param.shape - a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + fp8_dtype = determine_torch_fp8_type() + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) return [a, b] @@ -53,7 +55,7 @@ def get_configs(): ) @tilelang.jit(out_idx=[-1]) def fp8_matmul(M, N, K, block_M, block_N, block_K, num_stages, num_threads, k_pack, gemm_type): - dtype = T.float8_e4m3fnuz + 
dtype = determine_fp8_type() accum_dtype = T.float32 @T.prim_func @@ -104,8 +106,9 @@ def gemm_fp8_ss( def test_gemm_fp8(M, N, K): kernel = fp8_matmul(M, N, K) - a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) - b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=torch.float8_e4m3fnuz) + fp8_dtype = determine_torch_fp8_type() + a = (torch.randn(M, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) + b = (torch.randn(N, K, dtype=torch.float16, device="cuda") * 0.01).to(dtype=fp8_dtype) c = kernel(a, b) ref_c = ref_program(a, b) torch_assert_close(c, ref_c, rtol=1e-2, atol=1e-2) diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py b/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py new file mode 100644 index 0000000000..fc7fb44003 --- /dev/null +++ b/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py @@ -0,0 +1,225 @@ +import torch +import itertools +import tilelang +import tilelang.testing +from tilelang import tvm as tvm +import tilelang.language as T +from tilelang.tileop.base import GemmWarpPolicy +from tilelang.layout import make_swizzled_layout +from tilelang.intrinsics.mfma_macro_generator import MatrixCorePreshuffleIntrinEmitter +from tilelang.utils import determine_fp8_type + +tilelang.testing.set_random_seed(0) + + +def get_configs(): + block_Ms = [32, 64, 128] + block_Ns = [32, 64, 128] + block_Ks = [64, 128] + num_stages = [0, 1, 2] + + valid_configs = [] + + for m, n, k, stages in itertools.product(block_Ms, block_Ns, block_Ks, num_stages): + valid_configs.append( + { + "block_M": m, + "block_N": n, + "block_K": k, + "num_stages": stages, + } + ) + return valid_configs + + +@tilelang.autotune( + configs=get_configs(), +) +@tilelang.jit(out_idx=[-1]) +def tl_matmul( + M, + N, + K, + block_M, + block_N, + block_K, + num_stages, + k_pack=2, + num_threads=256, + in_dtype=None, + out_dtype=T.float32, + accum_dtype=T.float32, + a_transposed=False, + b_transposed=True, +): + if in_dtype is None: + in_dtype = determine_fp8_type() + b_preshuffle = True + warp_size = 64 + num_warps = num_threads // warp_size + + policy = GemmWarpPolicy.Square + m_warp, n_warp = policy.compute_warp_partition(block_M, block_N, num_warps) + + shared_scope = "shared" + warp_row_tiles = block_M // m_warp + warp_col_tiles = block_N // n_warp + + # MMA Wrapper to Auto Generate Code for MMA + mfma_emitter = MatrixCorePreshuffleIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=a_transposed, + b_transposed=b_transposed, + block_row_warps=m_warp, + block_col_warps=n_warp, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=block_K, + k_pack=k_pack, + b_preshuffle=b_preshuffle, + ) + local_size_a = mfma_emitter.local_size_a + local_size_b = mfma_emitter.local_size_b + + warp_rows = mfma_emitter.warp_rows + warp_cols = mfma_emitter.warp_cols + + micro_size_y = mfma_emitter.micro_size_y + micro_size_k = mfma_emitter.micro_size_k + pack_size_k = micro_size_k * k_pack + + A_shape = (K, M) if a_transposed else (M, K) + A_shared_shape = (block_K, block_M) if a_transposed else (block_M, block_K) + + B_shape = ( + (N // micro_size_y, K // pack_size_k, micro_size_y, pack_size_k) + if b_transposed + else (K // pack_size_k, N // micro_size_y, pack_size_k, micro_size_y) + ) + + @T.prim_func + def main( + A: T.Tensor(A_shape, in_dtype), + B: T.Tensor(B_shape, in_dtype), + C: T.Tensor((M, N), out_dtype), + ): + with 
T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=num_threads) as (bx, by): + A_shared = T.alloc_shared(A_shared_shape, in_dtype, scope=shared_scope) + A_local = T.alloc_local((warp_rows * local_size_a * k_pack), in_dtype) + B_local = T.alloc_local((warp_cols * local_size_b * k_pack), in_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + + T.annotate_layout( + { + A_shared: make_swizzled_layout(A_shared), + C_local: mfma_emitter.make_mfma_store_layout(C_local), + } + ) + + num_ko = K // block_K + num_ki = block_K // (k_pack * micro_size_k) + + # Improve L2 Cache + # T.use_swizzle(panel_size=10) + T.clear(C_local) + for ko in T.Pipelined(num_ko, num_stages=num_stages): + # Load A into shared memory + if a_transposed: + T.copy(A[ko * block_K, by * block_M], A_shared) + else: + T.copy(A[by * block_M, ko * block_K], A_shared) + + for ki in T.serial(0, num_ki): + mfma_emitter.ldmatrix_a( + A_local, + A_shared, + ki, + ) + mfma_emitter.ldmatrix_b(B_local, B, ki + ko * num_ki, pid_m=by, pid_n=bx) + + # Perform Matrix Multiplication + mfma_emitter.mfma(A_local, B_local, C_local, ki) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def shuffle_weight( + x: torch.Tensor, + layout=(16, 32), + k_pack=1, + is_transpose=False, +) -> torch.Tensor: + IN, IK = layout + BK = IK * k_pack + BN = IN + + N, K = (x.shape[-2], x.shape[-1]) if is_transpose else (x.shape[-1], x.shape[-2]) + assert N % BN == 0 + assert K % BK == 0 + + x = x.view(N // BN, BN, K // BK, BK) if is_transpose else x.view(K // BK, BK, N // BN, BN) + x = x.permute(0, 2, 1, 3) + return x.contiguous() + + +def assert_tl_matmul_correctness(M, N, K, k_pack=1, a_transposed=False, b_transposed=True): + in_dtype = determine_fp8_type() + out_dtype = T.float32 + accum_dtype = T.float32 + kernel = tl_matmul( + M, + N, + K, + k_pack=k_pack, + in_dtype=in_dtype, + out_dtype=out_dtype, + accum_dtype=accum_dtype, + a_transposed=a_transposed, + b_transposed=b_transposed, + ) + + src_code = kernel.get_kernel_source() + # src_code is the generated cuda source + assert src_code is not None + A_shape = (K, M) if a_transposed else (M, K) + B_shape = (N, K) if b_transposed else (K, N) + + A = (torch.rand(A_shape, device="cuda", dtype=torch.float16) / 10).to(getattr(torch, in_dtype)) + B = (torch.rand(B_shape, device="cuda", dtype=torch.float16) / 10).to(getattr(torch, in_dtype)) + + B_preshuffle = shuffle_weight(B, k_pack=k_pack, is_transpose=b_transposed) + C = kernel(A, B_preshuffle) + + profiler = kernel.get_profiler() + latency = profiler.do_bench() + + # Ensure that the latency is not None + assert latency is not None + print("time: ", latency) + + if a_transposed and b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.T.half(), B.T.half()).to(getattr(torch, out_dtype)) + elif a_transposed and not b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.T.half(), B.half()).to(getattr(torch, out_dtype)) + elif not a_transposed and b_transposed: + # Get Reference Result + ref_c = torch.matmul(A.half(), B.T.half()).to(getattr(torch, out_dtype)) + else: + # Get Reference Result + ref_c = torch.matmul(A.half(), B.half()).to(getattr(torch, out_dtype)) + + torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) + + +def test_assert_tl_matmul(): + assert_tl_matmul_correctness(512, 512, 512, k_pack=2) + + +if __name__ == "__main__": + test_assert_tl_matmul() diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8.py b/examples/gemm_fp8/example_tilelang_gemm_fp8.py index 
0869979756..3b575c78e8 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8.py @@ -1,6 +1,7 @@ import torch import tilelang import tilelang.language as T +from tilelang.utils import determine_fp8_type def calc_diff(x, y): @@ -55,21 +56,24 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 1024, T.float8_e4m3fn) - test_gemm_fp8(1024, 1024, 1024, T.float8_e5m2) + test_gemm_fp8(1024, 1024, 1024, determine_fp8_type()) + test_gemm_fp8(1024, 1024, 1024, determine_fp8_type("e5m2")) def run_regression_perf(): M, N, K = 4096, 4096, 4096 - dtype = "float8_e4m3" + dtype = determine_fp8_type() kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") - dtype = "float8_e5m2" - kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) - profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") - return (latency_e4m3 + latency_e5m2) / 2 + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + dtype = determine_fp8_type("e5m2") + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 + latency_e4m3 = profiler_e4m3.do_bench() + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py index a702e8ae0a..39c6fc333c 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_2xAcc.py @@ -1,6 +1,7 @@ import torch import tilelang import tilelang.language as T +from tilelang.utils import determine_fp8_type @tilelang.jit(out_idx=[-1]) @@ -73,21 +74,26 @@ def test_gemm_fp8(M, N, K, dtype): def main(): - test_gemm_fp8(1024, 1024, 8192, T.float8_e4m3fn) - test_gemm_fp8(1024, 1024, 8192, T.float8_e5m2) + test_gemm_fp8(1024, 1024, 8192, determine_fp8_type()) + test_gemm_fp8(1024, 1024, 8192, determine_fp8_type("e5m2")) def run_regression_perf(): M, N, K = 1024, 1024, 8192 - dtype = "float8_e4m3" + dtype = determine_fp8_type() kernel_e4m3 = matmul(M, N, K, 128, 128, 64, dtype) profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") - dtype = "float8_e5m2" - kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) - profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") - return (latency_e4m3 + latency_e5m2) / 2 + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + else: + latency_e4m3 = profiler_e4m3.do_bench() + if torch.version.hip is None: + dtype = determine_fp8_type("e5m2") + kernel_e5m2 = matmul(M, N, K, 128, 128, 64, dtype) + profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) + latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") + return (latency_e4m3 + latency_e5m2) / 2 + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py index 762885ec38..d9f749d9f2 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py +++ 
b/examples/gemm_fp8/example_tilelang_gemm_fp8_intrinsic.py @@ -4,11 +4,9 @@ from tvm import DataType import tilelang.language as T from tilelang.intrinsics import get_swizzle_layout -from tilelang.intrinsics.mma_macro_generator import ( - TensorCoreIntrinEmitter, -) -from tilelang.transform import simplify_prim_func -from tilelang.utils.tensor import map_torch_type +from tilelang.intrinsics.mma_macro_generator import TensorCoreIntrinEmitter +from tilelang.intrinsics.mfma_macro_generator import MatrixCoreIntrinEmitter +from tilelang.utils import determine_fp8_type tilelang.testing.set_random_seed(0) @@ -29,7 +27,6 @@ def transform_func(i, j): @tilelang.jit(out_idx=[2]) -@simplify_prim_func def tl_matmul( M, N, @@ -41,26 +38,17 @@ def tl_matmul( assert in_dtype in [ T.float16, T.float8_e4m3fn, + T.float8_e4m3fnuz, T.float8_e5m2, + T.float8_e5m2fnuz, T.int8, - ], "Currently only float16 and int8 are supported" + ], "Currently only float16, float8, and int8 are supported" assert out_dtype in [ T.float16, T.float32, T.int32, ], "Currently only float16, float32 and int32 are supported" - micro_size_x = micro_size_y = micro_size_k = 16 - - is_float8 = in_dtype in [ - T.float8_e4m3fn, - T.float8_e5m2, - T.float8_e4m3fn, - T.float8_e5m2fnuz, - ] - if out_dtype == T.int32 or is_float8: - micro_size_k = 32 - # This is a debug config block_row_warps = 2 block_col_warps = 2 @@ -80,6 +68,38 @@ def tl_matmul( B_shape = (N, K) A_shared_shape = (block_M, block_K) B_shared_shape = (block_N, block_K) + is_hip = torch.version.hip is not None + # MMA Wrapper to Auto Generate Code for MMA/MFMA + if is_hip: + mma_emitter = MatrixCoreIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=False, + b_transposed=True, + block_row_warps=block_row_warps, + block_col_warps=block_col_warps, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=chunk, + ) + else: + mma_emitter = TensorCoreIntrinEmitter( + a_dtype=in_dtype, + b_dtype=in_dtype, + accum_dtype=accum_dtype, + a_transposed=False, + b_transposed=True, + block_row_warps=block_row_warps, + block_col_warps=block_col_warps, + warp_row_tiles=warp_row_tiles, + warp_col_tiles=warp_col_tiles, + chunk=chunk, + ) + + micro_size_x = mma_emitter.M_DIM + micro_size_y = getattr(mma_emitter, "n_dim", getattr(mma_emitter, "N_DIM", micro_size_x)) + micro_size_k = mma_emitter.k_dim C_shared_shape = ( block_M // micro_size_x, block_N // micro_size_y, @@ -87,27 +107,12 @@ def tl_matmul( micro_size_y, ) - warp_size = 32 - threads = warp_size * (block_row_warps * block_col_warps) - local_size_a = (micro_size_x * micro_size_k) // warp_size - local_size_b = (micro_size_y * micro_size_k) // warp_size - local_size_c = (micro_size_x * micro_size_y) // warp_size - warp_rows = warp_row_tiles // micro_size_x - warp_cols = warp_col_tiles // micro_size_y - - # MMA Wrapper to Auto Generate Code for MMA - mma_emitter = TensorCoreIntrinEmitter( - a_dtype=in_dtype, - b_dtype=in_dtype, - accum_dtype=accum_dtype, - a_transposed=False, - b_transposed=True, - block_row_warps=block_row_warps, - block_col_warps=block_col_warps, - warp_row_tiles=warp_row_tiles, - warp_col_tiles=warp_col_tiles, - chunk=chunk, - ) + threads = mma_emitter.threads + local_size_a = mma_emitter.local_size_a + local_size_b = mma_emitter.local_size_b + local_size_c = mma_emitter.local_size_out + warp_rows = mma_emitter.warp_rows + warp_cols = mma_emitter.warp_cols @T.prim_func def gemm_fp8_intrinsic( @@ -160,7 +165,10 @@ def gemm_fp8_intrinsic( ) # Perform Matrix 
Multiplication - mma_emitter.mma(A_local, B_local, C_local) + if is_hip: + mma_emitter.mfma(A_local, B_local, C_local, ki) + else: + mma_emitter.mma(A_local, B_local, C_local) # Perform STMatrix mma_emitter.stmatrix( @@ -183,18 +191,17 @@ def gemm_fp8_intrinsic( def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): kernel = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) src_code = kernel.get_kernel_source() - print(src_code) # src_code is the generated cuda source assert src_code is not None - in_dtype = map_torch_type(in_dtype) - out_dtype = map_torch_type(out_dtype) - accum_dtype = map_torch_type(accum_dtype) + in_dtype = in_dtype.as_torch() + out_dtype = out_dtype.as_torch() + accum_dtype = accum_dtype.as_torch() if in_dtype in {torch.int8, torch.int32}: A = torch.randint(-128, 128, (M, K), dtype=torch.int8).to(in_dtype).cuda() B = torch.randint(-128, 128, (N, K), dtype=torch.int8).to(in_dtype).cuda() - elif in_dtype in {torch.float8_e4m3fn, torch.float8_e5m2}: + elif in_dtype in {torch.float8_e4m3fn, torch.float8_e4m3fnuz, torch.float8_e5m2, torch.float8_e5m2fnuz}: A = torch.randn(M, K).to(in_dtype).cuda() B = torch.randn(N, K).to(in_dtype).cuda() else: @@ -214,28 +221,27 @@ def assert_tl_matmul_correctness(M, N, K, in_dtype, out_dtype, accum_dtype): # Get Reference Result ref_c = torch.matmul(A.to(accum_dtype), B.T.to(accum_dtype)).to(out_dtype) - print(C) - print(ref_c) torch.testing.assert_close(C, ref_c, rtol=1e-2, atol=1e-2) def main(): - assert_tl_matmul_correctness(128, 128, 128, T.float8_e4m3fn, T.float32, T.float32) - assert_tl_matmul_correctness(128, 128, 128, T.float8_e5m2, T.float32, T.float32) + e4m3_dtype = determine_fp8_type() + assert_tl_matmul_correctness(128, 128, 128, e4m3_dtype, T.float32, T.float32) + e5m2_dtype = determine_fp8_type("e5m2") + assert_tl_matmul_correctness(128, 128, 128, e5m2_dtype, T.float32, T.float32) def run_regression_perf(): M, N, K = 4096, 4096, 4096 - out_dtype, accum_dtype = "float32", "float32" - in_dtype = T.float8_e4m3fn + out_dtype, accum_dtype = T.float32, T.float32 + in_dtype = determine_fp8_type() kernel_e4m3 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) profiler_e4m3 = kernel_e4m3.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") - in_dtype = T.float8_e5m2 - kernel_e5m2 = tl_matmul(M, N, K, in_dtype, out_dtype, accum_dtype) - profiler_e5m2 = kernel_e5m2.get_profiler(tilelang.TensorSupplyType.Integer) - latency_e5m2 = profiler_e5m2.do_bench(backend="cupti") - return (latency_e4m3 + latency_e5m2) / 2 + if torch.version.hip is None: + latency_e4m3 = profiler_e4m3.do_bench(backend="cupti") + else: + latency_e4m3 = profiler_e4m3.do_bench() + return latency_e4m3 if __name__ == "__main__": diff --git a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py index aa7e8b3608..72f09c2503 100644 --- a/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py +++ b/examples/gemm_fp8/example_tilelang_gemm_fp8_sm100.py @@ -1,7 +1,6 @@ import torch import tilelang import tilelang.language as T -from tilelang.utils.tensor import map_torch_type def matmul( @@ -41,14 +40,13 @@ def main( for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): T.copy(A[by * block_M, k * block_K], A_shared) T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm_v2( + T.tcgen05_gemm( A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, - wg_wait=-1, clear_accum=(k == 0), ) T.mbarrier_wait_parity(mbar, k % 2) @@ -75,8 
+73,8 @@ def calc_diff(x, y): threads = 256 for tvm_fp8_dtype in [T.float8_e4m3fn, T.float8_e5m2]: for tvm_acc_dtype in [T.float16, T.float32]: # , torch.float16]: - torch_fp8_dtype = map_torch_type(tvm_fp8_dtype) - torch_acc_dtype = map_torch_type(tvm_acc_dtype) + torch_fp8_dtype = tvm_fp8_dtype.as_torch() + torch_acc_dtype = tvm_acc_dtype.as_torch() print(f"running {tvm_fp8_dtype} -> {tvm_acc_dtype}") in_dtype, out_dtype, accum_dtype = tvm_fp8_dtype, tvm_acc_dtype, tvm_acc_dtype @@ -100,7 +98,6 @@ def calc_diff(x, y): out_idx=[2], target="cuda", pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, tilelang.PassConfigKey.TL_ENABLE_PTXAS_VERBOSE_OUTPUT: True, }, diff --git a/examples/gemm_int4/example_tilelang_gemm_int4.py b/examples/gemm_int4/example_tilelang_gemm_int4.py new file mode 100644 index 0000000000..3db000b616 --- /dev/null +++ b/examples/gemm_int4/example_tilelang_gemm_int4.py @@ -0,0 +1,113 @@ +"""Frontend int4 GEMM example for the T.gemm int4 path. + +This file intentionally models the desired TileLang frontend API: +- A/B are declared as T.int4 tensors +- the matmul is expressed with T.gemm(...) + +The example compiles the kernel, prints the generated CUDA source, and +checks correctness against a PyTorch reference. +""" + +import torch + +import tilelang +import tilelang.language as T + + +def matmul_nt_int4(M, N, K, block_M, block_N, block_K, threads=128): + @T.prim_func + def main( + A: T.Tensor((M, K), T.int4), + B: T.Tensor((N, K), T.int4), + C: T.Tensor((M, N), T.int32), + ): + with T.Kernel(T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), T.int4) + B_shared = T.alloc_shared((block_N, block_K), T.int4) + C_local = T.alloc_fragment((block_M, block_N), T.int32) + + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=3): + T.copy(A[by * block_M, ko * block_K], A_shared) + T.copy(B[bx * block_N, ko * block_K], B_shared) + # Frontend expectation: T.gemm should accept int4 operands directly. 
+ T.gemm(A_shared, B_shared, C_local, transpose_B=True) + + T.copy(C_local, C[by * block_M, bx * block_N]) + + return main + + +def compile_int4_gemm( + M=1024, + N=1024, + K=1024, + block_M=128, + block_N=128, + block_K=64, + threads=128, + print_cuda_source=True, +): + func = matmul_nt_int4(M, N, K, block_M, block_N, block_K, threads) + kernel = tilelang.compile(func, out_idx=-1) + print("Compilation succeeded.") + if print_cuda_source: + print(kernel.get_kernel_source()) + return func, kernel + + +def pack_int4(tensor: torch.Tensor) -> torch.Tensor: + if tensor.dtype != torch.int8: + raise TypeError(f"Expected torch.int8 logical int4 tensor, but got {tensor.dtype}.") + if tensor.ndim == 0 or tensor.shape[-1] % 2 != 0: + raise ValueError("The last dimension of a logical int4 tensor must be even for int8 packing.") + + tensor_i16 = tensor.to(torch.int16) + packed = (tensor_i16[..., ::2] & 0x0F) | ((tensor_i16[..., 1::2] & 0x0F) << 4) + return packed.to(torch.int8).contiguous() + + +def check_int4_gemm_correctness( + M=1024, + N=1024, + K=1024, + block_M=128, + block_N=128, + block_K=64, + threads=128, +): + if not torch.cuda.is_available(): + raise RuntimeError("CUDA is required to run the int4 GEMM example.") + + _, kernel = compile_int4_gemm( + M=M, + N=N, + K=K, + block_M=block_M, + block_N=block_N, + block_K=block_K, + threads=threads, + ) + + A_logical = torch.randint(-8, 8, (M, K), device="cuda", dtype=torch.int8) + B_logical = torch.randint(-8, 8, (N, K), device="cuda", dtype=torch.int8) + + A_packed = pack_int4(A_logical) + B_packed = pack_int4(B_logical) + C = kernel(A_packed, B_packed) + torch.cuda.synchronize() + + ref_c = torch.matmul(A_logical.cpu().to(torch.int32), B_logical.cpu().to(torch.int32).T) + torch.testing.assert_close(C.cpu(), ref_c, rtol=0, atol=0) + print("Correctness check passed.") + return C, ref_c + + +def main(): + # check_int4_gemm_correctness(M=16, N=16, K=32, block_M=16, block_N=16, block_K=32) + # check_int4_gemm_correctness(M=16, N=16, K=64, block_M=16, block_N=16, block_K=64) + check_int4_gemm_correctness() + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_sm100/README.md b/examples/gemm_sm100/README.md index d630d2d0d3..3ae66dde91 100644 --- a/examples/gemm_sm100/README.md +++ b/examples/gemm_sm100/README.md @@ -7,17 +7,30 @@ This directory contains examples for TileLang's experimental SM100 architecture ### 1. Manual TCGEN5.MMA Management Users must manually handle TCGEN5MMA operations using: - `T.alloc_tmem()` - Allocate Tensor Memory -- `T.gemm()` with `wg_wait=-1` - Launch TCGEN5MMA without waiting +- `T.tcgen05_gemm()` - Launch TCGEN5MMA without an implicit wait - Manual synchronization with mbarrier +For the default synchronous path, `T.gemm(..., mbar=...)` now inserts the +matching `mbarrier_wait_parity(...)` automatically after TCGEN5MMA issue. + ### 2. Manual mbarrier Synchronization TCGEN5MMA is asynchronous and requires explicit synchronization: ```python mbar = T.alloc_barrier(1) # expect-arrive-count = 1 -T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k==0) +T.tcgen05_gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, clear_accum=k==0) T.mbarrier_wait_parity(mbar, k%2) # Manual phase calculation required ``` +TileLang now has a conservative `InjectTcgen05Fence` pass on SM100+ that can +insert `tcgen05_before_thread_sync()` / `tcgen05_after_thread_sync()` around: +- `tvm_storage_sync("shared"|"shared.dyn")` +- linear `mbarrier_wait_parity(...) 
-> tcgen05/TMEM use` regions +- linear `tcgen05/TMEM use -> mbarrier_arrive(...)` regions + +This does **not** eliminate the need to structure the mbarrier protocol +explicitly in user code, and the examples in this directory still keep manual +fences where they make the handoff points obvious. + ## Examples ### TCGEN5MMA Example (`gemm_tcgen5mma.py`) @@ -61,8 +74,8 @@ def main( T.copy(B[bx * block_N, k * block_K], B_shared) # TCGEN5MMA computation: asynchronous launch, output to Tensor Memory - T.gemm(A_shared, B_shared, C_tmem, trans_A=False, trans_B=True, - mbar=mbar, wg_wait=-1, clear_accum=k==0) + T.tcgen05_gemm(A_shared, B_shared, C_tmem, trans_A=False, trans_B=True, + mbar=mbar, clear_accum=k==0) # Critical: wait for TCGEN5MMA completion T.mbarrier_wait_parity(mbar, k%2) @@ -84,7 +97,6 @@ block_M, block_N, block_K = 128, 256, 128 # Compile kernel jit_kernel = tilelang.compile(func, out_idx=[2], target="cuda", pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, # Required tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, # Required }) diff --git a/examples/gemm_sm100/gemm_mma.py b/examples/gemm_sm100/gemm_mma.py index 226e33c01e..e3a70df973 100644 --- a/examples/gemm_sm100/gemm_mma.py +++ b/examples/gemm_sm100/gemm_mma.py @@ -58,10 +58,7 @@ def main( func, out_idx=[2], target="cuda", - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) print(jit_kernel.get_kernel_source()) # 3. Test the kernel in Python with PyTorch data diff --git a/examples/gemm_sm100/gemm_tcgen5mma.py b/examples/gemm_sm100/gemm_tcgen5mma.py index 523a94fea6..229908992a 100644 --- a/examples/gemm_sm100/gemm_tcgen5mma.py +++ b/examples/gemm_sm100/gemm_tcgen5mma.py @@ -38,9 +38,9 @@ def main( C_shared = T.alloc_shared((block_M, block_N), out_dtype) for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages): - T.copy(A[by * block_M, k * block_K], A_shared) - T.copy(B[bx * block_N, k * block_K], B_shared) - T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, wg_wait=-1, clear_accum=k == 0) + T.copy(A[by * block_M, k * block_K], A_shared) # not trans_A + T.copy(B[bx * block_N, k * block_K], B_shared) # trans_B + T.tcgen05_gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, mbar=mbar, clear_accum=k == 0) T.mbarrier_wait_parity(mbar, k % 2) T.copy(C_tmem, C_local) @@ -52,10 +52,10 @@ def main( M, N, K = 4096, 4096, 8192 -block_M, block_N, block_K = 128, 256, 128 +block_M, block_N, block_K = 128, 128, 128 trans_A, trans_B = False, True in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float -num_stages = 2 +num_stages = 0 if block_N >= 256 or block_M >= 256 or block_K >= 256 else 2 threads = 256 func = matmul(M, N, K, block_M, block_N, block_K, trans_A, trans_B, in_dtype, out_dtype, accum_dtype, num_stages, threads) @@ -63,10 +63,7 @@ def main( func, out_idx=[2], target="cuda", - pass_configs={ - tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True, - tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True, - }, + pass_configs={tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True}, ) print(jit_kernel.get_kernel_source()) diff --git a/examples/gemm_sm100/gemm_tcgen5mma_ws.py b/examples/gemm_sm100/gemm_tcgen5mma_ws.py new file mode 100644 index 0000000000..b8f2adf41a --- /dev/null +++ b/examples/gemm_sm100/gemm_tcgen5mma_ws.py @@ -0,0 +1,164 @@ +# Non-persistent + +import torch +import tilelang +import 
tilelang.language as T +from tilelang.profiler import do_bench + + +@tilelang.jit +def gemm(A, B, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages, use_tma_store=True): + M, N, K = T.const("M, N, K") + + k_iters = T.ceildiv(K, block_K) + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[K, N], in_dtype] + C = T.empty((M, N), out_dtype) + + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128) as (bx, by): + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared((num_stages, block_K, block_N), in_dtype) + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + loaded = T.alloc_barrier([32] * num_stages) + consumed = T.alloc_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1]) + + tx = T.get_thread_binding() + + T.use_swizzle(8) + + if tx < 32: # warp 0: issue tma + for k in T.serial(k_iters): + T.mbarrier_wait_parity(consumed[k % num_stages], ((k // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + T.tma_copy( + B[k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], + B_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + T.mbarrier_arrive(loaded[k % num_stages]) + elif tx < 64: # warp 1: issue tcgen5 + for k in T.serial(k_iters): + T.mbarrier_wait_parity(loaded[k % num_stages], (k // num_stages) & 1) + T.tcgen05_gemm( + A_shared[k % num_stages, :, :], + B_shared[k % num_stages, :, :], + C_tmem, + mbar=consumed[k % num_stages], + clear_accum=k == 0, + ) + T.tcgen05_mma_arrive(tmem_full) + + # Wait for all tcgen5 to finish + T.mbarrier_wait_parity(tmem_full, 0) + T.copy(C_tmem, C_local) + if use_tma_store: + T.copy(C_local, C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N]) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) # STG256 + return C + + +@tilelang.jit +def gemm_2cta(A, B, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages, use_tma_store=True): + M, N, K = T.const("M, N, K") + + k_iters = T.ceildiv(K, block_K) + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[K, N], in_dtype] + C = T.empty((M, N), out_dtype) + + with T.Kernel(T.ceildiv(M, block_M), T.ceildiv(N, block_N), threads=128, cluster_dims=2) as (bx, by): + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared((num_stages, block_K, block_N // 2), in_dtype) # Each cta hold half of B + C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_shared = T.alloc_shared((block_M, block_N), out_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + loaded = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1]) + + tx = T.get_thread_binding() + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) # todo: automatically assume this + + T.use_swizzle(16) # TL will perform auto threadblock swizzle with cluster + + if tx < 32: # warp 0: issue tma + for k in T.serial(k_iters): + T.mbarrier_wait_parity(consumed[k % num_stages], ((k // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 
1) * block_K], + A_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + T.tma_copy( + B[k * block_K : (k + 1) * block_K, (by * 2 + cta_id) * (block_N // 2) : (by * 2 + cta_id + 1) * (block_N // 2)], + B_shared[k % num_stages, :, :], + barrier=loaded[k % num_stages], + ) + T.mbarrier_arrive(loaded[k % num_stages], 0) # arrive on leader cta's barrier + elif cta_id == 0 and tx < 64: # Only warp 1 on leader cta issues tcgen5 + for k in T.serial(k_iters): + T.mbarrier_wait_parity(loaded[k % num_stages], (k // num_stages) & 1) + T.tcgen05_gemm( + A_shared[k % num_stages, :, :], + B_shared[k % num_stages, :, :], + C_tmem, + mbar=consumed[k % num_stages], + clear_accum=k == 0, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full, arrive_2cta=True) + + # Wait for all tcgen5 to finish + T.mbarrier_wait_parity(tmem_full, 0) + T.copy(C_tmem, C_local) + if use_tma_store: + T.copy(C_local, C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N]) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) + return C + + +def main(): + M, N, K = 8192, 8192, 8192 + block_M, block_N, block_K = 128, 256, 64 + in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float + enable_2cta_tcgen5mma = True + num_stages = 6 if enable_2cta_tcgen5mma else 4 # Each cta only needs to load half of B, enabling larger stages + kernel = gemm_2cta if enable_2cta_tcgen5mma else gemm + + a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16) + b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16) + c = kernel(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages) + print(kernel.get_kernel_source(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages)) + + ref_c = (a.to(torch.float) @ b.to(torch.float)).to(torch.bfloat16) + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("All checks passed. 
✅") + + tl_latency = do_bench(lambda: kernel(a, b, block_M, block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages), backend="cupti") + torch_latency = do_bench(lambda: a @ b, backend="cupti") + print(f"Tilelang latency: {tl_latency} ms") + print(f"Flops: {2 * M * N * K / (tl_latency / 1e3) / 1e12} TFLOPS") + print(f"Torch latency: {torch_latency} ms") + print(f"Flops: {2 * M * N * K / (torch_latency / 1e3) / 1e12} TFLOPS") + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_sm100/gemm_tcgen5mma_ws_clc.py b/examples/gemm_sm100/gemm_tcgen5mma_ws_clc.py new file mode 100644 index 0000000000..10ac66937a --- /dev/null +++ b/examples/gemm_sm100/gemm_tcgen5mma_ws_clc.py @@ -0,0 +1,212 @@ +# Introduce CLC tile schedule + +import torch +import tilelang +import tilelang.language as T +from tilelang.profiler import do_bench + + +def get_swizzled_block_idx(tile_id, group_size, m_clusters, cta_id): + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + return bx, by + + +@tilelang.jit(pass_configs={tilelang.PassConfigKey.TL_DISABLE_THREAD_STORAGE_SYNC: True}) +def gemm_clc_persistent_2cta( + A, + B, + block_M, + block_N, + store_block_N, + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + group_size=8, + use_tma_store=True, +): + M, N, K = T.const("M, N, K") + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[K, N], in_dtype] + C = T.empty((M, N), out_dtype) + + m_blocks = T.ceildiv(M, block_M) + m_clusters = m_blocks // 2 + n_blocks = T.ceildiv(N, block_N) + total_cluster_tiles = m_clusters * n_blocks + k_blocks = T.ceildiv(K, block_K) + assert n_blocks % (2 * group_size) == 0 + + with T.Kernel(total_cluster_tiles * 2, threads=256, cluster_dims=2) as block_id: + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared((num_stages, block_K, block_N // 2), in_dtype) + C_tmem_0 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_tmem_1 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, store_block_N), out_dtype) + loaded = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_cluster_barrier([1] * 2) + tmem_empty = T.alloc_cluster_barrier([128 * 2] * 2) + schedule_arrived = T.alloc_cluster_barrier([1]) + schedule_finished = T.alloc_cluster_barrier([7]) + clc_result = T.alloc_shared((4,), "uint32", scope="shared") + schedule_valid = T.alloc_shared((1,), "int32") + schedule_tile_id = T.alloc_shared((1,), "int32") + + tx = T.get_thread_binding() + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) + + if tx < 32: # Producer (TMA loads) + for work_iter in T.unroll(total_cluster_tiles): + if work_iter > 0: + T.mbarrier_wait_parity(schedule_arrived, (work_iter - 1) & 1) + if tx == 0: + T.mbarrier_arrive(schedule_finished, 0) + if schedule_valid[0] == 0: + break + + tile_id = T.if_then_else( + work_iter == 0, + block_id // 2, + schedule_tile_id[0], + ) + bx, by = get_swizzled_block_idx(tile_id, group_size, m_clusters, cta_id) + + for k in T.serial(k_blocks): + phase = work_iter * k_blocks + k + T.mbarrier_wait_parity(consumed[phase % num_stages], ((phase // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[phase % 
num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + T.tma_copy( + B[k * block_K : (k + 1) * block_K, (by * 2 + cta_id) * block_N // 2 : (by * 2 + cta_id + 1) * block_N // 2], + B_shared[phase % num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + T.mbarrier_arrive(loaded[phase % num_stages], 0) + + elif cta_id == 0 and tx < 64: # MMA (cta_id 0 only) + for work_iter in T.unroll(total_cluster_tiles): + if work_iter > 0: + T.mbarrier_wait_parity(schedule_arrived, (work_iter - 1) & 1) + if tx == 32: + T.mbarrier_arrive(schedule_finished, 0) + if schedule_valid[0] == 0: + break + + T.mbarrier_wait_parity(tmem_empty[work_iter & 1], ((work_iter // 2) & 1) ^ 1) + for k in T.serial(k_blocks): + phase = work_iter * k_blocks + k + T.mbarrier_wait_parity(loaded[phase % num_stages], (phase // num_stages) & 1) + if work_iter & 1 == 0: + T.tcgen05_gemm( + A_shared[phase % num_stages, :, :], + B_shared[phase % num_stages, :, :], + C_tmem_0, + mbar=consumed[phase % num_stages], + clear_accum=k == 0, + use_2cta=True, + ) + else: + T.tcgen05_gemm( + A_shared[phase % num_stages, :, :], + B_shared[phase % num_stages, :, :], + C_tmem_1, + mbar=consumed[phase % num_stages], + clear_accum=k == 0, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full[work_iter & 1], arrive_2cta=True) + + elif 64 <= tx < 96: # CLC Scheduler (both CTAs) + for work_iter in T.unroll(total_cluster_tiles): + if tx == 64: + if cta_id == 0 and work_iter > 0: + T.mbarrier_wait_parity(schedule_finished, (work_iter - 1) & 1) + T.mbarrier_arrive_expect_tx(schedule_arrived, 16) + if cta_id == 0: + T.clc_try_cancel_multicast(clc_result, schedule_arrived) + T.mbarrier_wait_parity(schedule_arrived, work_iter & 1) + schedule_valid[0] = T.clc_is_canceled(clc_result) + schedule_tile_id[0] = T.cast(T.clc_get_first_ctaid_x(clc_result), "int32") // 2 + T.mbarrier_arrive(schedule_finished, 0) + if schedule_valid[0] == 0: + break + + elif 128 <= tx < 256: # Epilogue + for work_iter in T.unroll(total_cluster_tiles): + if work_iter > 0: + T.mbarrier_wait_parity(schedule_arrived, (work_iter - 1) & 1) + if tx == 128: + T.mbarrier_arrive(schedule_finished, 0) + if schedule_valid[0] == 0: + break + + tile_id = T.if_then_else( + work_iter == 0, + block_id // 2, + schedule_tile_id[0], + ) + bx, by = get_swizzled_block_idx(tile_id, group_size, m_clusters, cta_id) + + T.mbarrier_wait_parity(tmem_full[work_iter & 1], (work_iter // 2) & 1) + T.sync_threads(1, 128) + if work_iter & 1 == 0: + T.copy(C_tmem_0, C_local) + else: + T.copy(C_tmem_1, C_local) + T.mbarrier_arrive(tmem_empty[work_iter & 1], 0) + + if use_tma_store: + for i in T.unroll(T.ceildiv(block_N, store_block_N)): + T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared) + T.sync_threads(3, 128) + T.copy(C_shared, C[bx * block_M, by * block_N + i * store_block_N]) + T.sync_threads(3, 128) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) + + return C + + +def main(): + M, N, K = 8192, 8192, 8192 + block_M, block_N, block_K = 128, 256, 64 + store_block_N = 64 + in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float + num_stages = 6 + l2_swizzle_group_size = 8 + + kernel_args = (block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages, l2_swizzle_group_size) + + # a = (torch.rand(M, K, device="cuda", dtype=torch.bfloat16) * 2 - 1) + # b = (torch.rand(K, N, device="cuda", dtype=torch.bfloat16) * 2 - 1) + a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16) + b = torch.randn(K, 
N, device="cuda", dtype=torch.bfloat16) + print(gemm_clc_persistent_2cta.get_kernel_source(a, b, *kernel_args)) + c = gemm_clc_persistent_2cta(a, b, *kernel_args) + + ref_c = (a.to(torch.float) @ b.to(torch.float)).to(torch.bfloat16) + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("All checks passed. ✅") + + tl_latency = do_bench(lambda: gemm_clc_persistent_2cta(a, b, *kernel_args), backend="cupti") + torch_latency = do_bench(lambda: a @ b, backend="cupti") + print(f"Tilelang latency: {tl_latency} ms") + print(f"Flops: {2 * M * N * K / (tl_latency / 1e3) / 1e12} TFLOPS") + print(f"Torch latency: {torch_latency} ms") + print(f"Flops: {2 * M * N * K / (torch_latency / 1e3) / 1e12} TFLOPS") + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_sm100/gemm_tcgen5mma_ws_persistent.py b/examples/gemm_sm100/gemm_tcgen5mma_ws_persistent.py new file mode 100644 index 0000000000..5a7d820220 --- /dev/null +++ b/examples/gemm_sm100/gemm_tcgen5mma_ws_persistent.py @@ -0,0 +1,298 @@ +# Persistent, num_epi_stages = 2 + +import torch +import tilelang +import tilelang.language as T +from tilelang.carver.arch import driver +from tilelang.profiler import do_bench + + +@tilelang.jit +def gemm_persistent( + A, + B, + block_M, + block_N, + store_block_N, # block_N for C_shared + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + use_tma_store=True, +): + M, N, K = T.const("M, N, K") + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[K, N], in_dtype] + C = T.empty((M, N), out_dtype) + + sm_num = driver.get_num_sms() + m_blocks = T.ceildiv(M, block_M) + n_blocks = T.ceildiv(N, block_N) + assert K % (2 * block_K) == 0 # for simplicity + k_blocks = T.ceildiv(K, block_K) + waves = T.ceildiv(m_blocks * n_blocks, sm_num) + group_size = 8 + assert n_blocks % (2 * group_size) == 0 # Please adjust group_size if not satisfied + + with T.Kernel(sm_num, threads=256) as (block_id): + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared((num_stages, block_K, block_N), in_dtype) + C_tmem_0 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_tmem_1 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, store_block_N), out_dtype) + loaded = T.alloc_barrier([32] * num_stages) + consumed = T.alloc_barrier([1] * num_stages) + tmem_full = T.alloc_barrier([1] * 2) + tmem_empty = T.alloc_barrier([128] * 2) + + tx = T.get_thread_binding() + + if tx < 32: # warp 0: issue tma + for w in T.unroll(waves): + tile_id = sm_num * w + block_id + bx = (tile_id // group_size) % m_blocks + by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size + + if bx * block_M < M and by * block_N < N: + for k in T.serial(k_blocks): + phase = w * k_blocks + k + T.mbarrier_wait_parity(consumed[phase % num_stages], ((phase // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[phase % num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + T.tma_copy( + B[k * block_K : (k + 1) * block_K, by * block_N : (by + 1) * block_N], + B_shared[phase % num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + T.mbarrier_arrive(loaded[phase % num_stages]) + + elif tx < 64: # warp 1: issue tcgen5 + for w in T.unroll(waves): + tile_id = sm_num * w + block_id + bx = (tile_id // group_size) % m_blocks + by = (tile_id % group_size) + 
(tile_id // group_size) // m_blocks * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_empty[w & 1], ((w // 2) & 1) ^ 1) + for k in T.serial(k_blocks): + phase = w * k_blocks + k + T.mbarrier_wait_parity(loaded[phase % num_stages], (phase // num_stages) & 1) + if w & 1 == 0: + T.tcgen05_gemm( + A_shared[k % num_stages, :, :], + B_shared[k % num_stages, :, :], + C_tmem_0, + False, + False, + mbar=consumed[k % num_stages], + clear_accum=k == 0, + ) + else: + T.tcgen05_gemm( + A_shared[k % num_stages, :, :], + B_shared[k % num_stages, :, :], + C_tmem_1, + False, + False, + mbar=consumed[k % num_stages], + clear_accum=k == 0, + ) + T.tcgen05_mma_arrive(tmem_full[w & 1]) + + elif 128 <= tx < 256: # warp 4~7: epilogue + for w in T.unroll(waves): + tile_id = sm_num * w + block_id + bx = (tile_id // group_size) % m_blocks + by = (tile_id % group_size) + (tile_id // group_size) // m_blocks * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_full[w & 1], (w // 2) & 1) + if (w & 1) == 0: + T.copy(C_tmem_0, C_local) + else: + T.copy(C_tmem_1, C_local) + T.mbarrier_arrive(tmem_empty[w & 1]) + + if use_tma_store: + for i in T.unroll(T.ceildiv(block_N, store_block_N)): + T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N + i * store_block_N]) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) + return C + + +@tilelang.jit +def gemm_persistent_2cta( + A, + B, + block_M, + block_N, + store_block_N, # block_N for C_shared + block_K, + in_dtype, + out_dtype, + accum_dtype, + num_stages, + use_tma_store=True, +): + M, N, K = T.const("M, N, K") + + A: T.Tensor[[M, K], in_dtype] + B: T.Tensor[[K, N], in_dtype] + C = T.empty((M, N), out_dtype) + + sm_num = driver.get_num_sms() + num_clusters = sm_num // 2 + m_blocks = T.ceildiv(M, block_M) + m_clusters = m_blocks // 2 + n_blocks = T.ceildiv(N, block_N) + assert K % (2 * block_K) == 0 # for simplicity + k_blocks = T.ceildiv(K, block_K) + waves = T.ceildiv(m_blocks * n_blocks, sm_num) + group_size = 8 # in cluster + assert n_blocks % (2 * group_size) == 0 # Please adjust group_size if not satisfied + + with T.Kernel(sm_num, threads=256, cluster_dims=2) as (block_id): + A_shared = T.alloc_shared((num_stages, block_M, block_K), in_dtype) + B_shared = T.alloc_shared((num_stages, block_K, block_N // 2), in_dtype) + C_tmem_0 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_tmem_1 = T.alloc_tmem([block_M, block_N], accum_dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + C_local_cast = T.alloc_fragment((block_M, block_N), out_dtype) + C_shared = T.alloc_shared((block_M, store_block_N), out_dtype) + loaded = T.alloc_cluster_barrier([32 * 2] * num_stages) + consumed = T.alloc_cluster_barrier([1] * num_stages) + tmem_full = T.alloc_cluster_barrier([1] * 2) + tmem_empty = T.alloc_cluster_barrier([128 * 2] * 2) + + tx = T.get_thread_binding() + cta_id = T.block_rank_in_cluster() + T.assume(cta_id < 2) # todo: automatically assume this + + if tx < 32: # warp 0: issue tma + for w in T.unroll(waves): + # manual threadblock swizzle + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + for k in T.serial(k_blocks): + phase = w * k_blocks + k + 
T.mbarrier_wait_parity(consumed[phase % num_stages], ((phase // num_stages) & 1) ^ 1) + T.tma_copy( + A[bx * block_M : (bx + 1) * block_M, k * block_K : (k + 1) * block_K], + A_shared[phase % num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + + T.tma_copy( + B[k * block_K : (k + 1) * block_K, (by * 2 + cta_id) * block_N // 2 : (by * 2 + cta_id + 1) * block_N // 2], + B_shared[phase % num_stages, :, :], + barrier=loaded[phase % num_stages], + ) + T.mbarrier_arrive(loaded[phase % num_stages], 0) + + elif tx < 64 and cta_id == 0: # warp 1: issue tcgen5 + for w in T.unroll(waves): + # manual threadblock swizzle + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_empty[w & 1], ((w // 2) & 1) ^ 1) + for k in T.serial(k_blocks): + phase = w * k_blocks + k + T.mbarrier_wait_parity(loaded[phase % num_stages], (phase // num_stages) & 1) + if w & 1 == 0: + T.tcgen05_gemm( + A_shared[phase % num_stages, :, :], + B_shared[phase % num_stages, :, :], + C_tmem_0, + mbar=consumed[phase % num_stages], + clear_accum=k == 0, + use_2cta=True, + ) + else: + T.tcgen05_gemm( + A_shared[phase % num_stages, :, :], + B_shared[phase % num_stages, :, :], + C_tmem_1, + mbar=consumed[phase % num_stages], + clear_accum=k == 0, + use_2cta=True, + ) + T.tcgen05_mma_arrive(tmem_full[w & 1], arrive_2cta=True) + + elif 128 <= tx < 256: # warp 4~7: epilogue + for w in T.unroll(waves): + # manual threadblock swizzle + cluster_id = block_id // 2 + tile_id = num_clusters * w + cluster_id + bx_cluster = (tile_id // group_size) % m_clusters + bx = bx_cluster * 2 + cta_id + by = (tile_id % group_size) + (tile_id // group_size) // m_clusters * group_size + + if bx * block_M < M and by * block_N < N: + T.mbarrier_wait_parity(tmem_full[w & 1], (w // 2) & 1) + if (w & 1) == 0: + T.copy(C_tmem_0, C_local) + else: + T.copy(C_tmem_1, C_local) + T.mbarrier_arrive(tmem_empty[w & 1], 0) + + if use_tma_store: + for i in T.unroll(T.ceildiv(block_N, store_block_N)): + T.copy(C_local[:, i * store_block_N : (i + 1) * store_block_N], C_shared) + T.copy(C_shared, C[bx * block_M, by * block_N + i * store_block_N]) + else: + T.copy(C_local, C_local_cast) + T.copy(C_local_cast, C[bx * block_M, by * block_N]) + + return C + + +def main(): + M, N, K = 8192, 8192, 8192 + block_M, block_N, block_K = 128, 256, 64 + store_block_N = 64 + in_dtype, out_dtype, accum_dtype = T.bfloat16, T.bfloat16, T.float + enable_2cta_tcgen5mma = True + num_stages = 6 if enable_2cta_tcgen5mma else 4 # Each cta only needs to load half of B, enabling larger stages + kernel = gemm_persistent_2cta if enable_2cta_tcgen5mma else gemm_persistent + + a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16) + b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16) + print(kernel.get_kernel_source(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages)) + c = kernel(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages) + + ref_c = (a.to(torch.float) @ b.to(torch.float)).to(torch.bfloat16) + torch.testing.assert_close(c, ref_c, rtol=1e-2, atol=1e-2) + print("All checks passed. 
✅") + + tl_latency = do_bench( + lambda: kernel(a, b, block_M, block_N, store_block_N, block_K, in_dtype, out_dtype, accum_dtype, num_stages), backend="cupti" + ) + torch_latency = do_bench(lambda: a @ b, backend="cupti") + print(f"Tilelang latency: {tl_latency} ms") + print(f"Flops: {2 * M * N * K / (tl_latency / 1e3) / 1e12} TFLOPS") + print(f"Torch latency: {torch_latency} ms") + print(f"Flops: {2 * M * N * K / (torch_latency / 1e3) / 1e12} TFLOPS") + + +if __name__ == "__main__": + main() diff --git a/examples/gemm_sp/example_custom_compress.py b/examples/gemm_sp/example_custom_compress.py index 0544b82557..4b03ae83da 100644 --- a/examples/gemm_sp/example_custom_compress.py +++ b/examples/gemm_sp/example_custom_compress.py @@ -7,7 +7,7 @@ from tilelang.utils.sparse import randn_semi_sparse from tilelang.utils.tensor import torch_assert_close -from triton.testing import do_bench +from tilelang.profiler import do_bench import torch @@ -291,19 +291,17 @@ def kernel( return kernel -def main(m=16384, n=16384, k=16384, use_cutlass_layout=False, use_torch_compressor=False, accum_dtype=None, cfg="4090"): - if accum_dtype is None: - accum_dtype = T.float - kernel = matmul_sp_fp16_custom_compress(m, n, k, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype], use_cutlass_layout=use_cutlass_layout) +def main(M=1024, N=1024, K=1024, use_cutlass_layout=False, use_torch_compressor=False, accum_dtype=T.float, cfg="4090"): + kernel = matmul_sp_fp16_custom_compress(M, N, K, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype], use_cutlass_layout=use_cutlass_layout) - a = randn_semi_sparse(m, k, device="cuda", dtype=torch.half) - b = torch.randn(k, n, device="cuda", dtype=torch.half) + a = randn_semi_sparse(M, K, device="cuda", dtype=torch.half) + b = torch.randn(K, N, device="cuda", dtype=torch.half) if use_torch_compressor: assert not use_cutlass_layout, "torch sparse must be used with naive layout" a_sparse, e = torch_compress(a) else: - a_sparse, e = compress_kernel(m, k, 32, 32, T.float16, use_cutlass_layout=use_cutlass_layout)(a) + a_sparse, e = compress_kernel(M, K, 32, 32, T.float16, use_cutlass_layout=use_cutlass_layout)(a) c = kernel(a_sparse, e, b) @@ -316,7 +314,7 @@ def main(m=16384, n=16384, k=16384, use_cutlass_layout=False, use_torch_compress latency = do_bench(lambda: kernel(a_sparse, e, b)) ref_latency = do_bench(lambda: a @ b) - total_flops = 2 * m * n * k + total_flops = 2 * M * N * K tflops = total_flops / latency / 1e9 ref_tflops = total_flops / ref_latency / 1e9 print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") @@ -330,8 +328,15 @@ def main(m=16384, n=16384, k=16384, use_cutlass_layout=False, use_torch_compress parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") parser.add_argument("--use_cutlass_layout", action="store_true", help="Use cutlass layout for E tensor") parser.add_argument("--use_torch_compressor", action="store_true", help="Use torch sparse for reference") - parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") parser.add_argument("--cfg", type=str, choices=["4090"], default="4090") args = parser.parse_args() - accum_dtype = T.float if args.accum_dtype == "float" else T.float16 - main(args.m, args.n, args.k, args.use_cutlass_layout, args.use_torch_compressor, accum_dtype, args.cfg) + main( + M=args.m, + N=args.n, + K=args.k, + 
use_cutlass_layout=args.use_cutlass_layout, + use_torch_compressor=args.use_torch_compressor, + accum_dtype=args.accum_dtype, + cfg=args.cfg, + ) diff --git a/examples/gemm_sp/example_gemm_sp.py b/examples/gemm_sp/example_gemm_sp.py index 8163c84cc8..769ea67362 100644 --- a/examples/gemm_sp/example_gemm_sp.py +++ b/examples/gemm_sp/example_gemm_sp.py @@ -6,7 +6,7 @@ from tilelang.layout import make_cutlass_metadata_layout from tilelang.utils.sparse import compress, randn_semi_sparse from tilelang.contrib import nvcc -from triton.testing import do_bench +from tilelang.profiler import do_bench import torch @@ -97,13 +97,11 @@ def gemm_sp_fp16( return gemm_sp_fp16 -def main(m=16384, n=16384, k=16384, accum_dtype=None, cfg="4090"): - if accum_dtype is None: - accum_dtype = T.float - kernel = matmul_sp_fp16(m, n, k, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype]) +def main(M=1024, N=1024, K=1024, accum_dtype=T.float, cfg="h20"): + kernel = matmul_sp_fp16(M, N, K, accum_dtype, **DEFAULT_CONFIG[cfg][accum_dtype]) - a = randn_semi_sparse(m, k, device="cuda", dtype=torch.half) - b = torch.randn(k, n, device="cuda", dtype=torch.half) + a = randn_semi_sparse(M, K, device="cuda", dtype=torch.half) + b = torch.randn(K, N, device="cuda", dtype=torch.half) a_sparse, e = compress(a, transposed=False, block_k=DEFAULT_CONFIG[cfg][accum_dtype]["block_K"], arch=arch) c = kernel(a_sparse, e, b) @@ -117,7 +115,7 @@ def main(m=16384, n=16384, k=16384, accum_dtype=None, cfg="4090"): latency = do_bench(lambda: kernel(a_sparse, e, b)) ref_latency = do_bench(lambda: a @ b) - total_flops = 2 * m * n * k + total_flops = 2 * M * N * K tflops = total_flops / latency / 1e9 ref_tflops = total_flops / ref_latency / 1e9 print(f"Sparse TFLOPS: {tflops:.2f}, Latency: {latency / 1e3} s") @@ -129,8 +127,7 @@ def main(m=16384, n=16384, k=16384, accum_dtype=None, cfg="4090"): parser.add_argument("--m", type=int, default=16384, help="Matrix dimension M") parser.add_argument("--n", type=int, default=16384, help="Matrix dimension N") parser.add_argument("--k", type=int, default=16384, help="Matrix dimension K") - parser.add_argument("--accum_dtype", type=str, default="float", choices=["float", "float16"], help="Accumulation datatype") + parser.add_argument("--accum_dtype", type=str, default=T.float, choices=[T.float, T.float16], help="Accumulation datatype") parser.add_argument("--cfg", type=str, choices=["4090", "h20"], default="4090") args = parser.parse_args() - accum_dtype = T.float if args.accum_dtype == "float" else T.float16 - main(args.m, args.n, args.k, accum_dtype, args.cfg) + main(M=args.m, N=args.n, K=args.k, accum_dtype=args.accum_dtype, cfg=args.cfg) diff --git a/examples/gemm_sp/test_example_gemm_sp.py b/examples/gemm_sp/test_example_gemm_sp.py index fe26df1449..aa1a747f24 100644 --- a/examples/gemm_sp/test_example_gemm_sp.py +++ b/examples/gemm_sp/test_example_gemm_sp.py @@ -4,10 +4,14 @@ import example_gemm_sp +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_custom_compress(): example_custom_compress.main() +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version_eq(9, 0) def test_example_gemm_sp(): example_gemm_sp.main() diff --git a/examples/gemm_streamk/example_tilelang_gemm_streamk.py b/examples/gemm_streamk/example_tilelang_gemm_streamk.py index b2e8e93690..48dc175a96 100644 --- a/examples/gemm_streamk/example_tilelang_gemm_streamk.py +++ b/examples/gemm_streamk/example_tilelang_gemm_streamk.py @@ -158,7 +158,7 @@ def main(): 
         False,
         True,
         T.float16,
-        T.float16,
+        T.float32,  # fp32 for atomic add
         T.float32,
         2,
         64,
@@ -166,9 +166,10 @@
 
     print(kernel.get_kernel_source())
 
-    b_c = torch.zeros((m, n), device="cuda", dtype=torch.float16)
+    b_c = torch.zeros((m, n), device="cuda", dtype=torch.float32)
     kernel(A, B, b_c)
+    b_c = b_c.to(torch.float16)
 
     C = torch.matmul(A, B.T)
 
diff --git a/examples/gemv/example_gemv.py b/examples/gemv/example_gemv.py
index 8ca77a2e89..ddbe4fd7a6 100644
--- a/examples/gemv/example_gemv.py
+++ b/examples/gemv/example_gemv.py
@@ -194,7 +194,7 @@ def main(
             C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype)
         C_reduced = T.alloc_local((1,), accum_dtype)
         with T.attr(
-            T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]),
+            T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]),
             "reduce_scope",
             T.reinterpret(T.uint64(0), dtype="handle"),
         ):
@@ -227,10 +227,7 @@ def get_block_template_configs():
     rep=20,
 )
 @tl.jit(
-    pass_configs={
-        tl.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
-        tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
-    },
+    pass_configs={tl.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True},
     out_idx=[2],
 )
 def gemv_alloc_reducer(
@@ -304,7 +301,7 @@ def main(
             C_accum[0] += A_local[k].astype(accum_dtype) * B_local[k].astype(accum_dtype)
         C_reduced = T.alloc_local((1,), accum_dtype)
         with T.attr(
-            T.comm_reducer(lambda x, y: x + y, [T.Cast(accum_dtype, 0)]),
+            T.comm_reducer(lambda x, y: x + y, [T.cast(0, accum_dtype)]),
             "reduce_scope",
             T.reinterpret(T.uint64(0), dtype="handle"),
         ):
diff --git a/examples/grouped_gemm/example_grouped_gemm_bwd.py b/examples/grouped_gemm/example_grouped_gemm_bwd.py
index 49cce0d1dd..339f8bc1ae 100644
--- a/examples/grouped_gemm/example_grouped_gemm_bwd.py
+++ b/examples/grouped_gemm/example_grouped_gemm_bwd.py
@@ -5,7 +5,7 @@ import tilelang.language as T
 
 
-@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True})
+@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_warp_specialized": True})
 def grouped_gemm_fwd(batch_sum, batch_count, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16):
     """
     args:
@@ -157,7 +157,7 @@ def construct_inputs(batch_sizes_list, K, M, trans_b, padding_M, device, dtype):
     return A, B, C, batch_sizes, batch_offsets, batch_padded_offsets
 
 
-@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_tma_lower": True, "tl.disable_warp_specialized": True})
+@tilelang.jit(out_idx=[2], pass_configs={"tl.disable_warp_specialized": True})
 def grouped_gemm_bwd(batch_sum, batch_count, M, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16):
     """
     args:
diff --git a/examples/grouped_gemm/example_grouped_gemm_fwd_ptr.py b/examples/grouped_gemm/example_grouped_gemm_fwd_ptr.py
new file mode 100644
index 0000000000..d57edcc6ca
--- /dev/null
+++ b/examples/grouped_gemm/example_grouped_gemm_fwd_ptr.py
@@ -0,0 +1,186 @@
+import argparse
+import math
+import time
+
+import torch
+
+import tilelang as tl
+import tilelang.language as T
+
+
+def make_ptr_table(tensors):
+    assert tensors, "pointer table requires at least one tensor"
+    device = tensors[0].device
+    return torch.tensor([tensor.data_ptr() for tensor in tensors], device=device, dtype=torch.int64)
+
+
+def torch_grouped_gemm_ptr(a_list, b_list):
+    assert len(a_list) == len(b_list), "A/B group count mismatch"
+    outputs = []
+    for a, b in zip(a_list, b_list):
+        assert a.shape[1] == b.shape[0], "incompatible GEMM shapes"
+        outputs.append(torch.matmul(a, b))
+    return
outputs + + +def grouped_gemm_ptr(batch_sizes_list, K, N, block_M, block_N, block_K, num_stages=2, threads=128, dtype=T.float16): + # Keep per-group tensors separate and pass them via pointer tables. + # We currently use a common max_M storage shape per group because + # ptr-backed tensors with runtime-varying shapes are not stable enough yet. + # Multi-stage software pipelining on ptr-backed tensors is not correct yet. + # Keep a single-stage pipeline so the ptr path can still use T.copy lowering. + copy_num_stages = 1 + batch_count = len(batch_sizes_list) + max_M = max(batch_sizes_list) + batch_tile_offsets = [0] + for size in batch_sizes_list[:-1]: + batch_tile_offsets.append(batch_tile_offsets[-1] + math.ceil(size / block_M)) + total_m_blocks = sum(math.ceil(size / block_M) for size in batch_sizes_list) + accum_dtype = T.float32 + + @T.prim_func + def kernel( + A_ptrs: T.Tensor([batch_count], T.ptr), + B_ptrs: T.Tensor([batch_count], T.ptr), + C_ptrs: T.Tensor([batch_count], T.ptr), + batch_tile_offsets: T.Tensor([batch_count], T.int32), + ): + with T.Kernel(total_m_blocks, T.ceildiv(N, block_N), threads=threads) as (bx, by): + A_shared = T.alloc_shared((block_M, block_K), dtype) + B_shared = T.alloc_shared((block_K, block_N), dtype) + C_local = T.alloc_fragment((block_M, block_N), accum_dtype) + cur_batch_idx = T.alloc_var(dtype=T.int32) + cur_tile_offset = T.alloc_var(dtype=T.int32) + + cur_batch_idx = 0 + cur_tile_offset = 0 + for i in range(batch_count): + in_cur_batch_idx = bx >= batch_tile_offsets[i] + cur_batch_idx = T.if_then_else(in_cur_batch_idx, i, cur_batch_idx) + cur_tile_offset = T.if_then_else(in_cur_batch_idx, batch_tile_offsets[i], cur_tile_offset) + + m_start = (bx - cur_tile_offset) * block_M + A = T.make_tensor(A_ptrs[cur_batch_idx], (max_M, K), dtype) + B = T.make_tensor(B_ptrs[cur_batch_idx], (K, N), dtype) + C = T.make_tensor(C_ptrs[cur_batch_idx], (max_M, N), dtype) + + T.clear(C_local) + for ko in T.Pipelined(T.ceildiv(K, block_K), num_stages=copy_num_stages): + T.copy(A[m_start, ko * block_K], A_shared) + T.copy(B[ko * block_K, by * block_N], B_shared) + T.gemm(A_shared, B_shared, C_local) + + T.copy(C_local, C[m_start, by * block_N]) + + return kernel + + +def construct_inputs(batch_sizes_list, K, N, block_M, device, dtype): + max_M = max(batch_sizes_list) + batch_tile_offsets_list = [0] + for size in batch_sizes_list[:-1]: + batch_tile_offsets_list.append(batch_tile_offsets_list[-1] + math.ceil(size / block_M)) + # Each group owns an independent padded tensor; nothing is concatenated. 
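+    # Worked example (hypothetical sizes): batch_sizes_list=[16, 33] with
+    # block_M=32 gives per-group tile counts [1, 2], so batch_tile_offsets_list
+    # becomes [0, 1], matching the 1 + 2 = 3 m-blocks the kernel walks in total.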
+ a_list = [torch.zeros(max_M, K, device=device, dtype=dtype) for _ in batch_sizes_list] + b_list = [torch.randn(K, N, device=device, dtype=dtype) for _ in batch_sizes_list] + c_list = [torch.empty(max_M, N, device=device, dtype=dtype) for _ in batch_sizes_list] + for a, size in zip(a_list, batch_sizes_list): + a[:size].copy_(torch.randn(size, K, device=device, dtype=dtype)) + a_ptrs = make_ptr_table(a_list) + b_ptrs = make_ptr_table(b_list) + c_ptrs = make_ptr_table(c_list) + batch_tile_offsets = torch.tensor(batch_tile_offsets_list, device=device, dtype=torch.int32) + return a_list, b_list, c_list, a_ptrs, b_ptrs, c_ptrs, batch_tile_offsets + + +def verify_outputs(outputs, refs, batch_sizes_list, atol=1e-2, rtol=1e-2): + for idx, (out, ref, batch_size) in enumerate(zip(outputs, refs, batch_sizes_list)): + try: + torch.testing.assert_close(out[:batch_size], ref, atol=atol, rtol=rtol) + except AssertionError as err: + raise AssertionError(f"group {idx}: {err}") from err + + +def benchmark(kernel, inputs, warmup=50, rep=100): + for _ in range(warmup): + kernel(*inputs) + torch.cuda.synchronize() + + start = torch.cuda.Event(enable_timing=True) + end = torch.cuda.Event(enable_timing=True) + start.record() + for _ in range(rep): + kernel(*inputs) + end.record() + torch.cuda.synchronize() + return start.elapsed_time(end) / rep + + +def run_tilelang_grouped_gemm_ptr( + batch_sizes_list, + K, + N, + block_M, + block_N, + block_K, + num_stages=2, + threads=128, + profile=False, +): + device = torch.device("cuda") + dtype = torch.float16 + program = grouped_gemm_ptr(batch_sizes_list, K, N, block_M, block_N, block_K, num_stages, threads) + # The ptr-backed grouped GEMM example is intended to exercise the regular CUDA + # execution path; CuTeDSL does not support these handle tensors. 
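+    # An additional caveat (plain PyTorch semantics, not tilelang-specific): the
+    # pointer tables hold raw device addresses from Tensor.data_ptr(), so the
+    # tensors behind a_list/b_list/c_list must stay alive while the tables are
+    # in use; the int64 tables do not keep those allocations reachable.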
+ kernel = tl.compile( + program, + target="cuda", + execution_backend="auto", + pass_configs={"tl.disable_warp_specialized": True}, + ) + a_list, b_list, c_list, a_ptrs, b_ptrs, c_ptrs, batch_tile_offsets = construct_inputs(batch_sizes_list, K, N, block_M, device, dtype) + refs = torch_grouped_gemm_ptr([a[:size] for a, size in zip(a_list, batch_sizes_list)], b_list) + + kernel(a_ptrs, b_ptrs, c_ptrs, batch_tile_offsets) + verify_outputs(c_list, refs, batch_sizes_list) + print("✅ TileLang ptr-grouped-gemm matches PyTorch") + + if profile: + latency = benchmark(kernel, (a_ptrs, b_ptrs, c_ptrs, batch_tile_offsets)) + total_flops = sum(size * K * N * 2 for size in batch_sizes_list) + print(f"Latency: {latency:.4f} ms") + print(f"TFlops: {total_flops / (latency * 1e9):.4f}") + + +def test_grouped_gemm_ptr(): + run_tilelang_grouped_gemm_ptr([16, 33, 64], 128, 96, 32, 32, 32) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--batch_sizes", type=str, default="64,128,256", help="comma-separated per-group M sizes") + parser.add_argument("--K", type=int, default=4096, help="reduce dim") + parser.add_argument("--N", type=int, default=4096, help="output dim") + parser.add_argument("--profile", action="store_true", help="benchmark the kernel") + args = parser.parse_args() + + batch_sizes_list = [int(x.strip()) for x in args.batch_sizes.split(",") if x.strip()] + block_M = 64 + block_N = 128 + block_K = 64 + num_stages = 1 + threads = 256 + + t0 = time.time() + run_tilelang_grouped_gemm_ptr( + batch_sizes_list, + args.K, + args.N, + block_M, + block_N, + block_K, + num_stages=num_stages, + threads=threads, + profile=args.profile, + ) + print(f"End-to-end: {time.time() - t0:.3f} s") diff --git a/examples/grouped_gemm/test_example_grouped_gemm.py b/examples/grouped_gemm/test_example_grouped_gemm.py new file mode 100644 index 0000000000..dc0c945072 --- /dev/null +++ b/examples/grouped_gemm/test_example_grouped_gemm.py @@ -0,0 +1,59 @@ +import tilelang.testing + +import example_grouped_gemm_bwd +import example_grouped_gemm_fwd +import example_grouped_gemm_fwd_ptr + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) +def test_example_grouped_gemm_fwd_small(): + example_grouped_gemm_fwd.run_tilelang_grouped_gemm( + [5, 9, 13], + K=64, + M=96, + block_M=64, + block_N=64, + block_K=32, + trans_b=False, + num_stages=2, + threads=256, + profile=False, + ) + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) +def test_example_grouped_gemm_fwd_ptr_small(): + example_grouped_gemm_fwd_ptr.run_tilelang_grouped_gemm_ptr( + [5, 9, 13], + K=64, + N=96, + block_M=64, + block_N=64, + block_K=32, + num_stages=1, + threads=256, + profile=False, + ) + + +@tilelang.testing.requires_cuda +@tilelang.testing.requires_cuda_compute_version(9, 0) +def test_example_grouped_gemm_bwd_small(): + example_grouped_gemm_bwd.run_tilelang_grouped_gemm( + [5, 9, 13], + K=64, + M=96, + block_M=64, + block_N=64, + block_K=32, + trans_b=False, + num_stages=2, + threads=256, + profile=False, + ) + + +if __name__ == "__main__": + tilelang.testing.main() diff --git a/examples/kda/FLA_KDA/cumsum.py b/examples/kda/FLA_KDA/cumsum.py new file mode 100644 index 0000000000..0fb3368f6a --- /dev/null +++ b/examples/kda/FLA_KDA/cumsum.py @@ -0,0 +1,469 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, autotune_cache_kwargs, 
input_guard + +BS_LIST = [32, 64] + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps) for num_warps in [1, 2, 4, 8]], + key=["B", "H", "BT", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_local_cumsum_scalar_kernel( + s, + o, + scale, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + # [BT] + b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + if REVERSE: + b_z = tl.sum(b_s, axis=0) + b_o = -b_o + b_z[None] + b_s + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BS": BS}, num_warps=num_warps) for BS in BS_LIST for num_warps in [2, 4, 8]], + key=["B", "H", "S", "BT", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_local_cumsum_vector_kernel( + s, + o, + scale, + cu_seqlens, + chunk_indices, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + if REVERSE: + b_o = tl.cumsum(b_s, axis=0, reverse=True) + else: + b_o = tl.cumsum(b_s, axis=0) + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: 
args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BT": BT}, num_warps=num_warps, num_stages=num_stages) + for BT in [32, 64, 128, 256] + for num_warps in [2, 4, 8] + for num_stages in [1, 2, 3, 4] + ], + key=["B", "H", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_global_cumsum_scalar_kernel( + s, + o, + scale, + cu_seqlens, + T, + B: tl.constexpr, + H: tl.constexpr, + BT: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_nh = tl.program_id(0) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + T = eos - bos + + b_z = tl.zeros([], dtype=tl.float32) + NT = tl.cdiv(T, BT) + for i_c in range(NT): + i_t = NT - 1 - i_c if REVERSE else i_c + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h * T, (T,), (1,), (i_t * BT,), (BT,), (0,)) + else: + p_s = tl.make_block_ptr(s + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_o = tl.make_block_ptr(o + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_s = tl.load(p_s, boundary_check=(0,)).to(tl.float32) + b_o = tl.cumsum(b_s, axis=0) + b_ss = tl.sum(b_s, 0) + if REVERSE: + b_o = -b_o + b_ss + b_s + b_o += b_z + if i_c >= 0: + b_z += b_ss + if HAS_SCALE: + b_o *= scale + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0,)) + + +@triton.heuristics( + { + "HAS_SCALE": lambda args: args["scale"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BT": BT}, num_warps=num_warps, num_stages=num_stages) + for BT in [16, 32, 64, 128] + for num_warps in [2, 4, 8] + for num_stages in [1, 2, 3, 4] + ], + key=["B", "H", "S", "IS_VARLEN", "REVERSE"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_global_cumsum_vector_kernel( + s, + o, + scale, + cu_seqlens, + T, + B: tl.constexpr, + H: tl.constexpr, + S: tl.constexpr, + BT: tl.constexpr, + BS: tl.constexpr, + REVERSE: tl.constexpr, + HAS_SCALE: tl.constexpr, + IS_VARLEN: tl.constexpr, + HEAD_FIRST: tl.constexpr, +): + i_s, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + T = eos - bos + + b_z = tl.zeros([BS], dtype=tl.float32) + NT = tl.cdiv(T, BT) + for i_c in range(NT): + i_t = NT - 1 - i_c if REVERSE else i_c + if HEAD_FIRST: + p_s = tl.make_block_ptr(s + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h * T) * S, (T, S), (S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + else: + p_s = tl.make_block_ptr(s + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * S, (T, S), (H * S, 1), (i_t * BT, i_s * BS), (BT, BS), (1, 0)) + # [BT, BS] + b_s = tl.load(p_s, boundary_check=(0, 1)).to(tl.float32) + if REVERSE: + b_c = b_z[None, :] + tl.cumsum(b_s, axis=0, reverse=True) + else: + b_c = b_z[None, :] + tl.cumsum(b_s, axis=0) + if HAS_SCALE: + b_c *= scale + tl.store(p_o, b_c.to(p_o.dtype.element_ty), 
boundary_check=(0, 1)) + b_z += tl.sum(b_s, 0) + + +def chunk_local_cumsum_scalar( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, +) -> torch.Tensor: + if head_first: + B, H, T = g.shape + else: + B, T, H = g.shape + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + grid = (NT, B * H) + chunk_local_cumsum_scalar_kernel[grid]( + s=g_org, + o=g, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + B=B, + H=H, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return g + + +def chunk_local_cumsum_vector( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, +) -> torch.Tensor: + if head_first: + B, H, T, S = g.shape + else: + B, T, H, S = g.shape + BT = chunk_size + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + assert chunk_size == 2 ** (chunk_size.bit_length() - 1), "chunk_size must be a power of 2" + + g_org, g = g, torch.empty_like(g, dtype=output_dtype or g.dtype) + + def grid(meta): + return (triton.cdiv(meta["S"], meta["BS"]), NT, B * H) + + # keep cumulative normalizer in fp32 + # this kernel is equivalent to + # g = g.view(B, H, NT, BT, -1).cumsum(-2).view(B, H, T, -1) + chunk_local_cumsum_vector_kernel[grid]( + s=g_org, + o=g, + scale=scale, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + T=T, + B=B, + H=H, + S=S, + BT=BT, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return g + + +@input_guard +def chunk_global_cumsum_scalar( + s: torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if head_first: + B, H, T = s.shape + else: + B, T, H = s.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + + z = torch.empty_like(s, dtype=output_dtype or s.dtype) + grid = (N * H,) + chunk_global_cumsum_scalar_kernel[grid]( + s=s, + o=z, + scale=scale, + cu_seqlens=cu_seqlens, + T=T, + B=B, + H=H, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return z + + +@input_guard +def chunk_global_cumsum_vector( + s: torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if head_first: + B, H, T, S = s.shape + else: + B, T, H, S = s.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + BS = min(32, triton.next_power_of_2(S)) + + z = torch.empty_like(s, dtype=output_dtype or s.dtype) + grid = (triton.cdiv(S, BS), N * H) + chunk_global_cumsum_vector_kernel[grid]( + s=s, + o=z, + scale=scale, + cu_seqlens=cu_seqlens, + T=T, + B=B, + H=H, + S=S, + BS=BS, + HEAD_FIRST=head_first, + REVERSE=reverse, + ) + return z + + +@input_guard +def chunk_global_cumsum( + s: 
torch.Tensor, + reverse: bool = False, + cu_seqlens: torch.Tensor = None, + scale: float = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, +) -> torch.Tensor: + if cu_seqlens is not None: + assert s.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(s.shape) == 3: + return chunk_global_cumsum_scalar( + s=s, + reverse=reverse, + cu_seqlens=cu_seqlens, + scale=scale, + head_first=head_first, + output_dtype=output_dtype, + ) + elif len(s.shape) == 4: + return chunk_global_cumsum_vector( + s=s, + reverse=reverse, + cu_seqlens=cu_seqlens, + scale=scale, + head_first=head_first, + output_dtype=output_dtype, + ) + else: + raise ValueError( + f"Unsupported input shape {s.shape}, " + f"which should be [B, T, H]/[B, T, H, D] if `head_first=False` " + f"or [B, H, T]/[B, H, T, D] otherwise", + ) + + +@input_guard +def chunk_local_cumsum( + g: torch.Tensor, + chunk_size: int, + reverse: bool = False, + scale: float = None, + cu_seqlens: torch.Tensor = None, + head_first: bool = False, + output_dtype: torch.dtype = torch.float, + chunk_indices: torch.LongTensor = None, + **kwargs, +) -> torch.Tensor: + if cu_seqlens is not None: + assert g.shape[0] == 1, "Only batch size 1 is supported when cu_seqlens are provided" + if len(g.shape) == 3: + return chunk_local_cumsum_scalar( + g=g, + chunk_size=chunk_size, + reverse=reverse, + scale=scale, + cu_seqlens=cu_seqlens, + head_first=head_first, + output_dtype=output_dtype, + chunk_indices=chunk_indices, + ) + elif len(g.shape) == 4: + return chunk_local_cumsum_vector( + g=g, + chunk_size=chunk_size, + reverse=reverse, + scale=scale, + cu_seqlens=cu_seqlens, + head_first=head_first, + output_dtype=output_dtype, + chunk_indices=chunk_indices, + ) + else: + raise ValueError( + f"Unsupported input shape {g.shape}, which should be (B, T, H, D) if `head_first=False` or (B, H, T, D) otherwise", + ) diff --git a/examples/kda/FLA_KDA/fla_chunk_delta.py b/examples/kda/FLA_KDA/fla_chunk_delta.py new file mode 100644 index 0000000000..3b0fc908d0 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_delta.py @@ -0,0 +1,579 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl +from .fla_utils import prepare_chunk_indices, exp, exp2, USE_CUDA_GRAPH, autotune_cache_kwargs + +NUM_WARPS = [2, 4] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_GK": lambda args: args["gk"] is not None, + "USE_INITIAL_STATE": lambda args: args["h0"] is not None, + "STORE_FINAL_STATE": lambda args: args["ht"] is not None, + "SAVE_NEW_VALUE": lambda args: args["v_new"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] + for num_stages in [2, 3, 4] + for BV in [32, 64] + ], + key=["H", "K", "V", "BT", "USE_EXP2"], + use_cuda_graph=USE_CUDA_GRAPH, + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gated_delta_rule_fwd_kernel_h_blockdim64( + k, + v, + w, + v_new, + g, + gk, + h, + h0, + ht, + cu_seqlens, + chunk_offsets, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + STORE_FINAL_STATE: tl.constexpr, + SAVE_NEW_VALUE: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), 
tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_h1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_h2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_h3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_h4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + h += ((boh * H + i_h) * K * V).to(tl.int64) + v += ((bos * H + i_h) * V).to(tl.int64) + k += ((bos * H + i_h) * K).to(tl.int64) + w += ((bos * H + i_h) * K).to(tl.int64) + if SAVE_NEW_VALUE: + v_new += ((bos * H + i_h) * V).to(tl.int64) + stride_v = H * V + stride_h = H * K * V + stride_k = H * K + if USE_INITIAL_STATE: + h0 = h0 + i_nh * K * V + if STORE_FINAL_STATE: + ht = ht + i_nh * K * V + + # load initial state + if USE_INITIAL_STATE: + p_h0_1 = tl.make_block_ptr(h0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_h1 += tl.load(p_h0_1, boundary_check=(0, 1)).to(tl.float32) + if K > 64: + p_h0_2 = tl.make_block_ptr(h0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + b_h2 += tl.load(p_h0_2, boundary_check=(0, 1)).to(tl.float32) + if K > 128: + p_h0_3 = tl.make_block_ptr(h0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + b_h3 += tl.load(p_h0_3, boundary_check=(0, 1)).to(tl.float32) + if K > 192: + p_h0_4 = tl.make_block_ptr(h0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + b_h4 += tl.load(p_h0_4, boundary_check=(0, 1)).to(tl.float32) + + # main recurrence + for i_t in range(NT): + p_h1 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h1, b_h1.to(p_h1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_h2 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h2, b_h2.to(p_h2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_h3 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h3, b_h3.to(p_h3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_h4 = tl.make_block_ptr(h + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_h4, b_h4.to(p_h4.dtype.element_ty), boundary_check=(0, 1)) + + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 0), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v = tl.dot(b_w, b_h1.to(b_w.dtype)) + if K > 64: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 64), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h2.to(b_w.dtype)) + if K > 128: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 128), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h3.to(b_w.dtype)) + if K > 192: + p_w = tl.make_block_ptr(w, (T, K), (stride_k, 1), (i_t * BT, 192), (BT, 64), (1, 0)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_v += tl.dot(b_w, b_h4.to(b_w.dtype)) + p_v = tl.make_block_ptr(v, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) - b_v + + if SAVE_NEW_VALUE: + p_v = tl.make_block_ptr(v_new, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + tl.store(p_v, b_v.to(p_v.dtype.element_ty), boundary_check=(0, 1)) + + last_idx = min((i_t + 1) * BT, T) - 1 + if USE_G: + m_t = 
(i_t * BT + tl.arange(0, BT)) < T + b_g_last = tl.load(g + bos * H + last_idx * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + if USE_EXP2: + b_v = b_v * tl.where(m_t, exp2(b_g_last - b_g), 0)[:, None] + b_g_last = exp2(b_g_last) + else: + b_v = b_v * tl.where(m_t, exp(b_g_last - b_g), 0)[:, None] + b_g_last = exp(b_g_last) + b_h1 *= b_g_last + if K > 64: + b_h2 *= b_g_last + if K > 128: + b_h3 *= b_g_last + if K > 192: + b_h4 *= b_g_last + + if USE_GK: + o_k1 = tl.arange(0, 64) + b_gk_last1 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k1, mask=(o_k1 < K), other=0.0) + if USE_EXP2: + b_h1 *= exp2(b_gk_last1)[:, None] + else: + b_h1 *= exp(b_gk_last1)[:, None] + if K > 64: + o_k2 = 64 + o_k1 + b_gk_last2 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k2, mask=(o_k2 < K), other=0.0) + if USE_EXP2: + b_h2 *= exp2(b_gk_last2)[:, None] + else: + b_h2 *= exp(b_gk_last2)[:, None] + if K > 128: + o_k3 = 128 + o_k1 + b_gk_last3 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k3, mask=(o_k3 < K), other=0.0) + if USE_EXP2: + b_h3 *= exp2(b_gk_last3)[:, None] + else: + b_h3 *= exp(b_gk_last3)[:, None] + if K > 192: + o_k4 = 192 + o_k1 + b_gk_last4 = tl.load(gk + (bos + last_idx) * H * K + i_h * K + o_k4, mask=(o_k4 < K), other=0.0) + if USE_EXP2: + b_h4 *= exp2(b_gk_last4)[:, None] + else: + b_h4 *= exp(b_gk_last4)[:, None] + b_v = b_v.to(k.dtype.element_ty) + + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h1 += tl.dot(b_k, b_v) + if K > 64: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h2 += tl.dot(b_k, b_v) + if K > 128: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h3 += tl.dot(b_k, b_v) + if K > 192: + p_k = tl.make_block_ptr(k, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_h4 += tl.dot(b_k, b_v) + # epilogue + if STORE_FINAL_STATE: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h1.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h2.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h3.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_ht = tl.make_block_ptr(ht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_ht, b_h4.to(p_ht.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_GK": lambda args: args["gk"] is not None, + "USE_INITIAL_STATE": lambda args: args["dh0"] is not None, + "USE_FINAL_STATE_GRADIENT": lambda args: args["dht"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BV": BV}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4] + for num_stages in ([4, 3, 2]) + for BV in [64, 32] + ], + key=["H", "K", "V", "BT", "BV", "USE_G", "USE_EXP2"], + use_cuda_graph=USE_CUDA_GRAPH, + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def 
chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64( + q, + k, + w, + g, + gk, + dht, + dh0, + do, + dh, + dv, + dv2, + cu_seqlens, + chunk_offsets, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_GK: tl.constexpr, + USE_INITIAL_STATE: tl.constexpr, + USE_FINAL_STATE_GRADIENT: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_nh = tl.program_id(0), tl.program_id(1) + i_n, i_h = i_nh // H, i_nh % H + if IS_VARLEN: + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + boh = tl.load(chunk_offsets + i_n).to(tl.int32) + else: + bos, eos = i_n * T, i_n * T + T + NT = tl.cdiv(T, BT) + boh = i_n * NT + + # [BK, BV] + b_dh1 = tl.zeros([64, BV], dtype=tl.float32) + if K > 64: + b_dh2 = tl.zeros([64, BV], dtype=tl.float32) + if K > 128: + b_dh3 = tl.zeros([64, BV], dtype=tl.float32) + if K > 192: + b_dh4 = tl.zeros([64, BV], dtype=tl.float32) + + # calculate offset + q += ((bos * H + i_h) * K).to(tl.int64) + k += ((bos * H + i_h) * K).to(tl.int64) + w += ((bos * H + i_h) * K).to(tl.int64) + do += ((bos * H + i_h) * V).to(tl.int64) + dv += ((bos * H + i_h) * V).to(tl.int64) + dv2 += ((bos * H + i_h) * V).to(tl.int64) + dh += ((boh * H + i_h) * K * V).to(tl.int64) + if USE_GK: + gk += ((bos * H + i_h) * K).to(tl.int64) + + stride_v = H * V + stride_h = H * K * V + stride_k = H * K + if USE_INITIAL_STATE: + dh0 += i_nh * K * V + if USE_FINAL_STATE_GRADIENT: + dht += i_nh * K * V + + if USE_FINAL_STATE_GRADIENT: + p_dht1 = tl.make_block_ptr(dht, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + b_dh1 += tl.load(p_dht1, boundary_check=(0, 1)) + if K > 64: + p_dht2 = tl.make_block_ptr(dht, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + b_dh2 += tl.load(p_dht2, boundary_check=(0, 1)) + if K > 128: + p_dht3 = tl.make_block_ptr(dht, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + b_dh3 += tl.load(p_dht3, boundary_check=(0, 1)) + if K > 192: + p_dht4 = tl.make_block_ptr(dht, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + b_dh4 += tl.load(p_dht4, boundary_check=(0, 1)) + + for i_t in range(NT - 1, -1, -1): + p_dh1 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh1, b_dh1.to(p_dh1.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_dh2 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh2, b_dh2.to(p_dh2.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_dh3 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh3, b_dh3.to(p_dh3.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_dh4 = tl.make_block_ptr(dh + i_t * stride_h, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh4, b_dh4.to(p_dh4.dtype.element_ty), boundary_check=(0, 1)) + + last_idx = min((i_t + 1) * BT, T) - 1 + if USE_G: + bg_last = tl.load(g + (bos + last_idx) * H + i_h) + p_g = tl.make_block_ptr(g + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,)) + b_g = tl.load(p_g, boundary_check=(0,)) + if USE_EXP2: + bg_last_exp = exp2(bg_last) + b_g_exp = exp2(b_g) + else: + bg_last_exp = exp(bg_last) + b_g_exp = exp(b_g) + + p_dv = tl.make_block_ptr(dv, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv2 = tl.make_block_ptr(dv2, (T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, 
(T, V), (stride_v, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + + b_do = tl.load(p_do, boundary_check=(0, 1)) + + # Update dv + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 0), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k1 = tl.arange(0, 64) + b_gk_last1 = tl.load(gk + last_idx * H * K + o_k1, mask=(o_k1 < K), other=0.0) + b_dv = tl.dot(b_k, b_dh1.to(b_k.dtype)) + + if K > 64: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 64), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k2 = 64 + o_k1 + b_gk_last2 = tl.load(gk + last_idx * H * K + o_k2, mask=(o_k2 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh2.to(b_k.dtype)) + + if K > 128: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 128), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k3 = 128 + o_k1 + b_gk_last3 = tl.load(gk + last_idx * H * K + o_k3, mask=(o_k3 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh3.to(b_k.dtype)) + + if K > 192: + p_k = tl.make_block_ptr(k, (T, K), (stride_k, 1), (i_t * BT, 192), (BT, 64), (1, 0)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + if USE_GK: + o_k4 = 192 + o_k1 + b_gk_last4 = tl.load(gk + last_idx * H * K + o_k4, mask=(o_k4 < K), other=0.0) + b_dv += tl.dot(b_k, b_dh4.to(b_k.dtype)) + + if USE_G: + m_t = (i_t * BT + tl.arange(0, BT)) < T + if USE_EXP2: + b_dv *= tl.where(m_t, exp2(bg_last - b_g), 0)[:, None] + else: + b_dv *= tl.where(m_t, exp(bg_last - b_g), 0)[:, None] + b_dv += tl.load(p_dv, boundary_check=(0, 1)) + + tl.store(p_dv2, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + # Update dh + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (0, i_t * BT), (64, BT), (0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + if USE_G: + b_dh1 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh1 *= exp2(b_gk_last1[:, None]) + else: + b_dh1 *= exp(b_gk_last1[:, None]) + b_dh1 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 64: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (64, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + if USE_G: + b_dh2 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh2 *= exp2(b_gk_last2[:, None]) + else: + b_dh2 *= exp(b_gk_last2[:, None]) + b_dh2 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 128: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (128, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, boundary_check=(0, 1)) + if USE_G: + b_dh3 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh3 *= exp2(b_gk_last3[:, None]) + else: + b_dh3 *= exp(b_gk_last3[:, None]) + b_dh3 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + if K > 192: + p_q = tl.make_block_ptr(q, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + p_w = tl.make_block_ptr(w, (K, T), (1, stride_k), (192, i_t * BT), (64, BT), (0, 1)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_w = tl.load(p_w, 
boundary_check=(0, 1)) + if USE_G: + b_dh4 *= bg_last_exp + b_q = b_q * b_g_exp[None, :] + if USE_GK: + if USE_EXP2: + b_dh4 *= exp2(b_gk_last4[:, None]) + else: + b_dh4 *= exp(b_gk_last4[:, None]) + b_dh4 += tl.dot(b_q.to(b_q.dtype), b_do.to(b_q.dtype)) * scale - tl.dot(b_w, b_dv.to(b_w.dtype)) + + if USE_INITIAL_STATE: + p_dh0 = tl.make_block_ptr(dh0, (K, V), (V, 1), (0, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh0, b_dh1.to(p_dh0.dtype.element_ty), boundary_check=(0, 1)) + if K > 64: + p_dh1 = tl.make_block_ptr(dh0, (K, V), (V, 1), (64, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh1, b_dh2.to(p_dh1.dtype.element_ty), boundary_check=(0, 1)) + if K > 128: + p_dh2 = tl.make_block_ptr(dh0, (K, V), (V, 1), (128, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh2, b_dh3.to(p_dh2.dtype.element_ty), boundary_check=(0, 1)) + if K > 192: + p_dh3 = tl.make_block_ptr(dh0, (K, V), (V, 1), (192, i_v * BV), (64, BV), (1, 0)) + tl.store(p_dh3, b_dh4.to(p_dh3.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gated_delta_rule_fwd_h( + k: torch.Tensor, + w: torch.Tensor, + u: torch.Tensor, + g: torch.Tensor = None, + gk: torch.Tensor = None, + initial_state: torch.Tensor = None, + output_final_state: bool = False, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + save_new_value: bool = True, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +) -> tuple[torch.Tensor, torch.Tensor]: + B, T, H, K, V = *k.shape, u.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + # N: the actual number of sequences in the batch with either equal or variable lengths + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + assert K <= 256, "current kernel does not support head dimension larger than 256." + + h = k.new_empty(B, NT, H, K, V) + final_state = k.new_empty(N, H, K, V, dtype=torch.float32) if output_final_state else None + + v_new = torch.empty_like(u) if save_new_value else None + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), N * H) + + chunk_gated_delta_rule_fwd_kernel_h_blockdim64[grid]( + k=k, + v=u, + w=w, + v_new=v_new, + g=g, + gk=gk, + h=h, + h0=initial_state, + ht=final_state, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return h, v_new, final_state + + +def chunk_gated_delta_rule_bwd_dhu( + q: torch.Tensor, + k: torch.Tensor, + w: torch.Tensor, + do: torch.Tensor, + dv: torch.Tensor, + g: torch.Tensor = None, + gk: torch.Tensor = None, + h0: torch.Tensor = None, + dht: torch.Tensor = None, + scale: float = None, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, # SY: remove this argument and force chunk size 64? + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + B, T, H, K, V = *q.shape, do.shape[-1] + # N: the actual number of sequences in the batch with either equal or variable lengths + BT = 64 + assert K <= 256, "current kernel does not support head dimension being larger than 256." 
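+    # Rationale: these blockdim64 kernels keep at most four 64-wide K slices
+    # in registers (b_dh1..b_dh4 in the kernel above), so the head dimension
+    # tops out at 4 * 64 = 256.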
+ + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + if cu_seqlens is None: + N, NT, chunk_offsets = B, triton.cdiv(T, BT), None + + dh = q.new_empty(B, NT, H, K, V) + dh0 = torch.empty_like(h0, dtype=torch.float32) if h0 is not None else None + dv2 = torch.empty_like(dv) + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), N * H) + + chunk_gated_delta_rule_bwd_kernel_dhu_blockdim64[grid]( + q=q, + k=k, + w=w, + g=g, + gk=gk, + dht=dht, + dh0=dh0, + do=do, + dh=dh, + dv=dv, + dv2=dv2, + cu_seqlens=cu_seqlens, + chunk_offsets=chunk_offsets, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return dh, dh0, dv2 diff --git a/examples/kda/FLA_KDA/fla_chunk_inter.py b/examples/kda/FLA_KDA/fla_chunk_inter.py new file mode 100644 index 0000000000..e6de9bb28f --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_inter.py @@ -0,0 +1,193 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, exp2, autotune_cache_kwargs, check_shared_mem + +BK_LIST = [32, 64] if check_shared_mem() else [16, 32] +BV_LIST = [64, 128] if check_shared_mem("ampere") else [16, 32] + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BK_LIST + for BV in BV_LIST + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_bwd_kernel_inter( + q, + k, + v, + g, + h, + do, + dh, + dq, + dk, + dv, + dw, + dg, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + o_k = i_k * BK + tl.arange(0, BK) + o_t = i_t * BT + tl.arange(0, BT) + m_k = o_k < K + m_t = o_t < T + m_last = o_t == min(T, i_t * BT + BT) - 1 + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + g += (bos * H + i_h) * K + h += (i_tg * H + i_h) * K * V + do += (bos * H + i_h) * V + dh += (i_tg * H + i_h) * K * V + dq += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dw += (bos * H + i_h) * K + dv += (bos * H + i_h) * V + dg += (bos * H + i_h) * K + + p_g = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + p_gn = g + (min(T, i_t * BT + BT) - 1) * H * K + o_k + b_gn = tl.load(p_gn, mask=m_k, other=0) + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_dw = tl.zeros([BT, BK], dtype=tl.float32) + b_dgk = tl.zeros([BK], dtype=tl.float32) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + 
p_h = tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BK] + b_dgk += tl.sum(b_h * b_dh, axis=0) + # [BT, BK] + b_dq += tl.dot(b_do, b_h.to(b_do.dtype)) + b_dk += tl.dot(b_v, b_dh.to(b_v.dtype)) + + p_dv = tl.make_block_ptr(dv, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_dv = tl.load(p_dv, boundary_check=(0, 1)) + b_dw += tl.dot(b_dv.to(b_v.dtype), b_h.to(b_v.dtype)) + + p_dw = tl.make_block_ptr(dw, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dw, -b_dw.to(p_dw.dtype.element_ty), boundary_check=(0, 1)) + + b_dgk *= exp2(b_gn) + b_dq *= scale + b_dq = b_dq * exp2(b_g) + b_dk = b_dk * tl.where(m_t[:, None], exp2(b_gn[None, :] - b_g), 0) + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dgk += tl.sum(b_dk * b_k, axis=0) + b_dg = b_q * b_dq - b_k * b_dk + m_last[:, None] * b_dgk + + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_kda_bwd_dqkwg( + q: torch.Tensor, + k: torch.Tensor, + w: torch.Tensor, + v: torch.Tensor, + h: torch.Tensor, + g: torch.Tensor, + do: torch.Tensor, + dh: torch.Tensor, + dv: torch.Tensor, + scale: float = None, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +): + B, T, H, K, V = *k.shape, v.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + dq = torch.empty_like(q, dtype=torch.float) + dk = torch.empty_like(k, dtype=torch.float) + dw = torch.empty_like(w) + dg = torch.empty_like(g) + + def grid(meta): + return (triton.cdiv(K, meta["BK"]), NT, B * H) + + chunk_kda_bwd_kernel_inter[grid]( + q=q, + k=k, + v=v, + g=g, + h=h, + do=do, + dh=dh, + dq=dq, + dk=dk, + dv=dv, + dw=dw, + dg=dg, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + ) + return dq, dk, dw, dg diff --git a/examples/kda/FLA_KDA/fla_chunk_intra.py b/examples/kda/FLA_KDA/fla_chunk_intra.py new file mode 100644 index 0000000000..244f05f1c1 --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_intra.py @@ -0,0 +1,650 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + +from .fla_utils import autotune_cache_kwargs, exp2, prepare_chunk_indices +from .cumsum import chunk_local_cumsum + +IS_TF32_SUPPORTED = False +if IS_TF32_SUPPORTED: + SOLVE_TRIL_DOT_PRECISION = tl.constexpr("tf32x3") +else: + SOLVE_TRIL_DOT_PRECISION = 
tl.constexpr("ieee") +SOLVE_TRIL_DOT_PRECISION = tl.constexpr("tf32") +# ============================================================================ +# Fused inter + solve_tril kernel: compute off-diagonal Akk and solve in one pass +# ============================================================================ + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BK": BK}, num_warps=num_warps) for BK in [32, 64] for num_warps in [1, 2, 4]], + key=["H", "K", "BC"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_kda_fwd_kernel_inter_solve_fused( + q, + k, + g, + beta, + Aqk, + Akk_diag, + Akk, + scale, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + """ + Fused kernel: compute inter-subchunk Akk + solve_tril in one pass. + Prerequisite: token_parallel has already computed diagonal Akk blocks in Akk_diag. + + This kernel: + 1. Computes off-diagonal Aqk blocks -> writes to global + 2. Computes off-diagonal Akk blocks -> keeps in registers + 3. Loads diagonal Akk blocks from Akk_diag (fp32) + 4. Does forward substitution on diagonals + 5. Computes merged Akk_inv + 6. Writes Akk_inv to Akk + """ + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + if i_t * BT >= T: + return + + i_tc0 = i_t * BT + i_tc1 = i_t * BT + BC + i_tc2 = i_t * BT + 2 * BC + i_tc3 = i_t * BT + 3 * BC + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + Aqk += (bos * H + i_h) * BT + Akk += (bos * H + i_h) * BT + Akk_diag += (bos * H + i_h) * BC + + m_tc1 = (i_tc1 + tl.arange(0, BC)) < T + m_tc2 = (i_tc2 + tl.arange(0, BC)) < T + m_tc3 = (i_tc3 + tl.arange(0, BC)) < T + + b_Aqk10 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk10 = tl.zeros([BC, BC], dtype=tl.float32) + + b_Aqk20 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk20 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk21 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk21 = tl.zeros([BC, BC], dtype=tl.float32) + + b_Aqk30 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk30 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk31 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk31 = tl.zeros([BC, BC], dtype=tl.float32) + b_Aqk32 = tl.zeros([BC, BC], dtype=tl.float32) + b_Akk32 = tl.zeros([BC, BC], dtype=tl.float32) + + ################################################################################ + # 1. 
off-diagonal blocks + ################################################################################ + for i_k in range(tl.cdiv(K, BK)): + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + p_k0 = tl.make_block_ptr(k, (K, T), (1, H * K), (i_k * BK, i_tc0), (BK, BC), (0, 1)) + p_g0 = tl.make_block_ptr(g, (K, T), (1, H * K), (i_k * BK, i_tc0), (BK, BC), (0, 1)) + b_kt0 = tl.load(p_k0, boundary_check=(0, 1)).to(tl.float32) + b_gt0 = tl.load(p_g0, boundary_check=(0, 1)).to(tl.float32) + + b_kt1, b_gt1 = b_kt0, b_gt0 + b_kt2, b_gt2 = b_kt0, b_gt0 + if i_tc1 < T: + p_q1 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + p_k1 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + p_g1 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc1, i_k * BK), (BC, BK), (1, 0)) + + b_q1 = tl.load(p_q1, boundary_check=(0, 1)).to(tl.float32) + b_k1 = tl.load(p_k1, boundary_check=(0, 1)).to(tl.float32) + b_g1 = tl.load(p_g1, boundary_check=(0, 1)).to(tl.float32) + b_kt1 = tl.trans(b_k1) + b_gt1 = tl.trans(b_g1) + + b_gn1 = tl.load(g + i_tc1 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn1 = tl.where(m_tc1[:, None], exp2(b_g1 - b_gn1[None, :]), 0) + b_qg1 = b_q1 * b_gqn1 + b_kg1 = b_k1 * b_gqn1 + b_kgt = b_kt0 * exp2(b_gn1[:, None] - b_gt0) + b_Aqk10 += tl.dot(b_qg1, b_kgt) + b_Akk10 += tl.dot(b_kg1, b_kgt) + + if i_tc2 < T: + p_q2 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + p_k2 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + p_g2 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc2, i_k * BK), (BC, BK), (1, 0)) + + b_q2 = tl.load(p_q2, boundary_check=(0, 1)).to(tl.float32) + b_k2 = tl.load(p_k2, boundary_check=(0, 1)).to(tl.float32) + b_g2 = tl.load(p_g2, boundary_check=(0, 1)).to(tl.float32) + b_kt2 = tl.trans(b_k2) + b_gt2 = tl.trans(b_g2) + + b_gn2 = tl.load(g + i_tc2 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn2 = tl.where(m_tc2[:, None], exp2(b_g2 - b_gn2[None, :]), 0) + b_qg2 = b_q2 * b_gqn2 + b_kg2 = b_k2 * b_gqn2 + b_kgt = b_kt0 * exp2(b_gn2[:, None] - b_gt0) + b_Aqk20 += tl.dot(b_qg2, b_kgt) + b_Akk20 += tl.dot(b_kg2, b_kgt) + + b_kgt = b_kt1 * exp2(b_gn2[:, None] - b_gt1) + b_Aqk21 += tl.dot(b_qg2, b_kgt) + b_Akk21 += tl.dot(b_kg2, b_kgt) + + if i_tc3 < T: + p_q3 = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + p_k3 = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + p_g3 = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_tc3, i_k * BK), (BC, BK), (1, 0)) + b_q3 = tl.load(p_q3, boundary_check=(0, 1)).to(tl.float32) + b_k3 = tl.load(p_k3, boundary_check=(0, 1)).to(tl.float32) + b_g3 = tl.load(p_g3, boundary_check=(0, 1)).to(tl.float32) + + b_gn3 = tl.load(g + i_tc3 * H * K + o_k, mask=m_k, other=0).to(tl.float32) + b_gqn3 = tl.where(m_tc3[:, None], exp2(b_g3 - b_gn3[None, :]), 0) + b_qg3 = b_q3 * b_gqn3 + b_kg3 = b_k3 * b_gqn3 + b_kgt = b_kt0 * exp2(b_gn3[:, None] - b_gt0) + b_Aqk30 += tl.dot(b_qg3, b_kgt) + b_Akk30 += tl.dot(b_kg3, b_kgt) + + b_kgt = b_kt1 * exp2(b_gn3[:, None] - b_gt1) + b_Aqk31 += tl.dot(b_qg3, b_kgt) + b_Akk31 += tl.dot(b_kg3, b_kgt) + + b_kgt = b_kt2 * exp2(b_gn3[:, None] - b_gt2) + b_Aqk32 += tl.dot(b_qg3, b_kgt) + b_Akk32 += tl.dot(b_kg3, b_kgt) + + ################################################################################ + # 2. 
save off-diagonal Aqk blocks and prepare Akk + ################################################################################ + if i_tc1 < T: + p_Aqk10 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc1, 0), (BC, BC), (1, 0)) + tl.store(p_Aqk10, (b_Aqk10 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b1 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc1,), (BC,), (0,)) + b_b1 = tl.load(p_b1, boundary_check=(0,)).to(tl.float32) + b_Akk10 = b_Akk10 * b_b1[:, None] + if i_tc2 < T: + p_Aqk20 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Aqk21 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc2, BC), (BC, BC), (1, 0)) + tl.store(p_Aqk20, (b_Aqk20 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk21, (b_Aqk21 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b2 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc2,), (BC,), (0,)) + b_b2 = tl.load(p_b2, boundary_check=(0,)).to(tl.float32) + b_Akk20 = b_Akk20 * b_b2[:, None] + b_Akk21 = b_Akk21 * b_b2[:, None] + if i_tc3 < T: + p_Aqk30 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, 0), (BC, BC), (1, 0)) + p_Aqk31 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, BC), (BC, BC), (1, 0)) + p_Aqk32 = tl.make_block_ptr(Aqk, (T, BT), (H * BT, 1), (i_tc3, 2 * BC), (BC, BC), (1, 0)) + tl.store(p_Aqk30, (b_Aqk30 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk31, (b_Aqk31 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Aqk32, (b_Aqk32 * scale).to(Aqk.dtype.element_ty), boundary_check=(0, 1)) + + p_b3 = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_tc3,), (BC,), (0,)) + b_b3 = tl.load(p_b3, boundary_check=(0,)).to(tl.float32) + b_Akk30 = b_Akk30 * b_b3[:, None] + b_Akk31 = b_Akk31 * b_b3[:, None] + b_Akk32 = b_Akk32 * b_b3[:, None] + + ################################################################################ + # 3. load diagonal Akk blocks + ################################################################################ + p_Akk00 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc0, 0), (BC, BC), (1, 0)) + p_Akk11 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc1, 0), (BC, BC), (1, 0)) + p_Akk22 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Akk33 = tl.make_block_ptr(Akk_diag, (T, BC), (H * BC, 1), (i_tc3, 0), (BC, BC), (1, 0)) + # each diagonal block is stored contiguously: row i of block s is at Akk_diag[t=i_t*BT+s*BC+i, :BC] + b_Ai00 = tl.load(p_Akk00, boundary_check=(0, 1)).to(tl.float32) + b_Ai11 = tl.load(p_Akk11, boundary_check=(0, 1)).to(tl.float32) + b_Ai22 = tl.load(p_Akk22, boundary_check=(0, 1)).to(tl.float32) + b_Ai33 = tl.load(p_Akk33, boundary_check=(0, 1)).to(tl.float32) + + ################################################################################ + # 4. 
forward substitution on diagonals + ################################################################################ + o_i = tl.arange(0, BC) + m_A = o_i[:, None] > o_i[None, :] + m_I = o_i[:, None] == o_i[None, :] + + b_Ai00 = -tl.where(m_A, b_Ai00, 0) + b_Ai11 = -tl.where(m_A, b_Ai11, 0) + b_Ai22 = -tl.where(m_A, b_Ai22, 0) + b_Ai33 = -tl.where(m_A, b_Ai33, 0) + + # Forward substitution: load from Akk_diag (stride H*BC, columns 0:BC) + for i in range(2, min(BC, T - i_tc0)): + b_a00 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a00 = tl.where(o_i < i, b_a00, 0.0) + b_a00 += tl.sum(b_a00[:, None] * b_Ai00, 0) + b_Ai00 = tl.where((o_i == i)[:, None], b_a00, b_Ai00) + for i in range(BC + 2, min(2 * BC, T - i_tc0)): + b_a11 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a11 = tl.where(o_i < i - BC, b_a11, 0.0) + b_a11 += tl.sum(b_a11[:, None] * b_Ai11, 0) + b_Ai11 = tl.where((o_i == i - BC)[:, None], b_a11, b_Ai11) + for i in range(2 * BC + 2, min(3 * BC, T - i_tc0)): + b_a22 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a22 = tl.where(o_i < i - 2 * BC, b_a22, 0.0) + b_a22 += tl.sum(b_a22[:, None] * b_Ai22, 0) + b_Ai22 = tl.where((o_i == i - 2 * BC)[:, None], b_a22, b_Ai22) + for i in range(3 * BC + 2, min(4 * BC, T - i_tc0)): + b_a33 = -tl.load(Akk_diag + (i_tc0 + i) * H * BC + o_i) + b_a33 = tl.where(o_i < i - 3 * BC, b_a33, 0.0) + b_a33 += tl.sum(b_a33[:, None] * b_Ai33, 0) + b_Ai33 = tl.where((o_i == i - 3 * BC)[:, None], b_a33, b_Ai33) + + b_Ai00 += m_I + b_Ai11 += m_I + b_Ai22 += m_I + b_Ai33 += m_I + + # ################################################################################ + # # 5. compute merged inverse using off-diagonals + # ################################################################################ + + # we used tf32x3 to maintain matrix inverse's precision whenever possible. + b_Ai10 = -tl.dot(tl.dot(b_Ai11, b_Akk10, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + b_Ai21 = -tl.dot(tl.dot(b_Ai22, b_Akk21, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai11, input_precision=SOLVE_TRIL_DOT_PRECISION) + b_Ai32 = -tl.dot(tl.dot(b_Ai33, b_Akk32, input_precision=SOLVE_TRIL_DOT_PRECISION), b_Ai22, input_precision=SOLVE_TRIL_DOT_PRECISION) + + b_Ai20 = -tl.dot( + b_Ai22, + tl.dot(b_Akk20, b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk21, b_Ai10, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + b_Ai31 = -tl.dot( + b_Ai33, + tl.dot(b_Akk31, b_Ai11, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk32, b_Ai21, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + b_Ai30 = -tl.dot( + b_Ai33, + tl.dot(b_Akk30, b_Ai00, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk31, b_Ai10, input_precision=SOLVE_TRIL_DOT_PRECISION) + + tl.dot(b_Akk32, b_Ai20, input_precision=SOLVE_TRIL_DOT_PRECISION), + input_precision=SOLVE_TRIL_DOT_PRECISION, + ) + + ################################################################################ + # 6. 
store full Akk_inv to Akk + ################################################################################ + + p_Akk00 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc0, 0), (BC, BC), (1, 0)) + p_Akk10 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc1, 0), (BC, BC), (1, 0)) + p_Akk11 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc1, BC), (BC, BC), (1, 0)) + p_Akk20 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, 0), (BC, BC), (1, 0)) + p_Akk21 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, BC), (BC, BC), (1, 0)) + p_Akk22 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc2, 2 * BC), (BC, BC), (1, 0)) + p_Akk30 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 0), (BC, BC), (1, 0)) + p_Akk31 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, BC), (BC, BC), (1, 0)) + p_Akk32 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 2 * BC), (BC, BC), (1, 0)) + p_Akk33 = tl.make_block_ptr(Akk, (T, BT), (H * BT, 1), (i_tc3, 3 * BC), (BC, BC), (1, 0)) + + tl.store(p_Akk00, b_Ai00.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk10, b_Ai10.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk11, b_Ai11.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk20, b_Ai20.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk21, b_Ai21.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk22, b_Ai22.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk30, b_Ai30.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk31, b_Ai31.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk32, b_Ai32.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_Akk33, b_Ai33.to(Akk.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4]], + key=["BK", "NC", "BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["B", "T"]) +def chunk_kda_bwd_kernel_intra( + q, + k, + g, + beta, + dAqk, + dAkk, + dq, + dq2, + dk, + dk2, + dg, + dg2, + db, + cu_seqlens, + chunk_indices, + B, + T, + H: tl.constexpr, + K: tl.constexpr, + BT: tl.constexpr, + BC: tl.constexpr, + BK: tl.constexpr, + NC: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_kc, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + i_k, i_i = i_kc // NC, i_kc % NC + + all = B * T + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + else: + bos, eos = i_b * T, i_b * T + T + T = eos - bos + + i_ti = i_t * BT + i_i * BC + if i_ti >= T: + return + + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + g += (bos * H + i_h) * K + beta += bos * H + i_h + + dAqk += (bos * H + i_h) * BT + dAkk += (bos * H + i_h) * BT + dq += (bos * H + i_h) * K + dq2 += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dk2 += (bos * H + i_h) * K + dg += (bos * H + i_h) * K + dg2 += (bos * H + i_h) * K + db += (i_k * all + bos) * H + i_h + + p_g = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + b_g = tl.load(p_g, boundary_check=(0, 1)) + + p_b = tl.make_block_ptr(beta, (T,), (H,), (i_ti,), (BC,), (0,)) + b_b = 
tl.load(p_b, boundary_check=(0,)) + + b_dq2 = tl.zeros([BC, BK], dtype=tl.float32) + b_dk2 = tl.zeros([BC, BK], dtype=tl.float32) + if i_i > 0: + p_gn = g + i_ti * H * K + o_k + # [BK,] + b_gn = tl.load(p_gn, mask=m_k, other=0) + for i_j in range(0, i_i): + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_dAqk = tl.make_block_ptr(dAqk, (T, BT), (H * BT, 1), (i_ti, i_j * BC), (BC, BC), (1, 0)) + p_dAkk = tl.make_block_ptr(dAkk, (T, BT), (H * BT, 1), (i_ti, i_j * BC), (BC, BC), (1, 0)) + # [BC, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_kg = b_k * exp2(b_gn[None, :] - b_gk) + # [BC, BC] + b_dAqk = tl.load(p_dAqk, boundary_check=(0, 1)) + b_dAkk = tl.load(p_dAkk, boundary_check=(0, 1)) + # [BC, BK] + b_dq2 += tl.dot(b_dAqk, b_kg) + b_dk2 += tl.dot(b_dAkk, b_kg) + b_gqn = exp2(b_g - b_gn[None, :]) + b_dq2 *= b_gqn + b_dk2 *= b_gqn + + o_i = tl.arange(0, BC) + m_dA = (i_ti + o_i) < T + o_dA = (i_ti + o_i) * H * BT + i_i * BC + p_kj = k + i_ti * H * K + o_k + p_gkj = g + i_ti * H * K + o_k + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + # [BC] + b_dAqk = tl.load(dAqk + o_dA + j, mask=m_dA, other=0) + b_dAkk = tl.load(dAkk + o_dA + j, mask=m_dA, other=0) + # [BK] + b_kj = tl.load(p_kj, mask=m_k, other=0).to(tl.float32) + b_gkj = tl.load(p_gkj, mask=m_k, other=0).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] >= j + # [BC, BK] + b_kgj = b_kj[None, :] * exp2(b_g - b_gkj[None, :]) + b_dq2 += tl.where(m_i, b_dAqk[:, None] * b_kgj, 0.0) + b_dk2 += tl.where(m_i, b_dAkk[:, None] * b_kgj, 0.0) + + p_kj += H * K + p_gkj += H * K + b_db = tl.sum(b_dk2 * b_k, 1) + b_dk2 *= b_b[:, None] + + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dq2 = tl.make_block_ptr(dq2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_db = tl.make_block_ptr(db, (T,), (H,), (i_ti,), (BC,), (0,)) + + b_dg2 = b_q * b_dq2 + b_dq2 = b_dq2 + tl.load(p_dq, boundary_check=(0, 1)) + tl.store(p_dq2, b_dq2.to(p_dq2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0,)) + + tl.debug_barrier() + b_dkt = tl.zeros([BC, BK], dtype=tl.float32) + + NC = min(NC, tl.cdiv(T - i_t * BT, BC)) + if i_i < NC - 1: + p_gn = g + (min(i_ti + BC, T) - 1) * H * K + o_k + # [BK,] + b_gn = tl.load(p_gn, mask=m_k, other=0) + for i_j in range(i_i + 1, NC): + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT + i_j * BC, i_k * BK), (BC, BK), (1, 0)) + p_b = tl.make_block_ptr(beta, (T,), (H,), (i_t * BT + i_j * BC,), (BC,), (0,)) + p_dAqk = tl.make_block_ptr(dAqk, (BT, T), (1, H * BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1)) + p_dAkk = tl.make_block_ptr(dAkk, (BT, T), (1, H * BT), (i_i * BC, i_t * BT + i_j * BC), (BC, BC), (0, 1)) + # [BC] + b_b = tl.load(p_b, boundary_check=(0,)) + # [BC, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_kb = tl.load(p_k, 
boundary_check=(0, 1)) * b_b[:, None] + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + # [BC, BC] + b_dAqk = tl.load(p_dAqk, boundary_check=(0, 1)) + b_dAkk = tl.load(p_dAkk, boundary_check=(0, 1)) + + o_j = i_t * BT + i_j * BC + o_i + m_j = o_j < T + # [BC, BK] + b_gkn = tl.where(m_j[:, None], exp2(b_gk - b_gn[None, :]), 0) + b_qg = b_q * b_gkn + b_kbg = b_kb * b_gkn + # [BC, BK] + b_dkt += tl.dot(b_dAqk, b_qg) + tl.dot(b_dAkk, b_kbg) + b_dkt *= exp2(b_gn[None, :] - b_g) + + o_dA = i_ti * H * BT + i_i * BC + o_i + p_qj = q + i_ti * H * K + o_k # [bs, i_ti, i_h*block_h, i_k*bk:(i_k+1)*bk] + p_kj = k + i_ti * H * K + o_k + p_gkj = g + i_ti * H * K + o_k + p_bj = beta + i_ti * H + + for j in range(0, min(BC, T - i_t * BT - i_i * BC)): + # [BC,] + b_dAqk = tl.load(dAqk + o_dA + j * H * BT) + b_dAkk = tl.load(dAkk + o_dA + j * H * BT) + # [BK,] + b_qj = tl.load(p_qj, mask=m_k, other=0).to(tl.float32) + b_kbj = tl.load(p_kj, mask=m_k, other=0).to(tl.float32) * tl.load(p_bj) + b_gkj = tl.load(p_gkj, mask=m_k, other=0).to(tl.float32) + # [BC, BK] + m_i = o_i[:, None] <= j + b_gkq = exp2(b_gkj[None, :] - b_g) + b_dkt += tl.where(m_i, (b_dAkk[:, None] * b_kbj[None, :] + b_dAqk[:, None] * b_qj[None, :]) * b_gkq, 0.0) + + p_qj += H * K + p_kj += H * K + p_gkj += H * K + p_bj += H + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dk2 = tl.make_block_ptr(dk2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + p_dg2 = tl.make_block_ptr(dg2, (T, K), (H * K, 1), (i_ti, i_k * BK), (BC, BK), (1, 0)) + + b_dg2 += (b_dk2 - b_dkt) * b_k + tl.load(p_dg, boundary_check=(0, 1)) + b_dk2 += tl.load(p_dk, boundary_check=(0, 1)) + b_dk2 += b_dkt + + tl.store(p_dk2, b_dk2.to(p_dk2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg2, b_dg2.to(p_dg2.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_kda_bwd_intra( + q: torch.Tensor, + k: torch.Tensor, + g: torch.Tensor, + beta: torch.Tensor, + dAqk: torch.Tensor, + dAkk: torch.Tensor, + dq: torch.Tensor, + dk: torch.Tensor, + db: torch.Tensor, + dg: torch.Tensor, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, + chunk_size: int = 64, +): + B, T, H, K = k.shape + BT = chunk_size + BC = min(16, BT) + BK = min(32, triton.next_power_of_2(K)) + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, BT) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + NC = triton.cdiv(BT, BC) + NK = triton.cdiv(K, BK) + + dq2 = torch.empty_like(q) + dk2 = torch.empty_like(k) + db2 = beta.new_empty(NK, *beta.shape, dtype=torch.float) + dg2 = torch.empty_like(dg, dtype=torch.float) + grid = (NK * NC, NT, B * H) + chunk_kda_bwd_kernel_intra[grid]( + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dq2=dq2, + dk=dk, + dk2=dk2, + dg=dg, + dg2=dg2, + db=db2, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + B=B, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + BK=BK, + NC=NC, + ) + dq = dq2 + dk = dk2 + db = db2.sum(0).add_(db) + dg = chunk_local_cumsum( + dg2, + chunk_size=chunk_size, + reverse=True, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + ) + + return dq, dk, db, dg + + +def chunk_kda_fwd_inter_solve_fused( + q, + k, + gk, + beta, + Aqk, + Akk_diag, + Akk, + scale, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, +): + B, T, H, K = k.shape + assert K 
<= 256
+    BT = chunk_size
+    if chunk_indices is None and cu_seqlens is not None:
+        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    BC = 16
+
+    grid = (NT, B * H)
+    chunk_kda_fwd_kernel_inter_solve_fused[grid](
+        q=q,
+        k=k,
+        g=gk,
+        beta=beta,
+        Aqk=Aqk,
+        Akk_diag=Akk_diag,
+        Akk=Akk,
+        scale=scale,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        BT=BT,
+        BC=BC,
+    )
diff --git a/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py b/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py
new file mode 100644
index 0000000000..1dba202821
--- /dev/null
+++ b/examples/kda/FLA_KDA/fla_chunk_intra_token_parallel.py
@@ -0,0 +1,168 @@
+# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang
+# Token-parallel implementation of the KDA intra-chunk kernel
+
+import torch
+import triton
+import triton.language as tl
+
+from .fla_utils import exp2, autotune_cache_kwargs
+
+
+@triton.heuristics(
+    {
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[triton.Config({"BH": BH}, num_warps=num_warps) for BH in [1, 2, 4, 8] for num_warps in [1, 2, 4, 8]],
+    key=["K", "H"],
+    **autotune_cache_kwargs,
+)
+@triton.jit(do_not_specialize=["T", "N"])
+def chunk_kda_fwd_kernel_intra_token_parallel(
+    q,
+    k,
+    g,
+    beta,
+    Aqk,
+    Akk,
+    scale,
+    cu_seqlens,
+    N,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    BT: tl.constexpr,
+    BC: tl.constexpr,
+    BH: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_tg, i_hg = tl.program_id(0), tl.program_id(1)
+
+    if IS_VARLEN:
+        i_n = 0
+        left, right = 0, N
+
+        # Unrolled binary search over cu_seqlens for the sequence containing this token.
+        # Triton cannot early-exit a data-dependent loop, so the trip count is fixed:
+        # 20 iterations cover up to 2^20 (~1M) sequences, which is enough in practice.
+        for _ in range(20):
+            if left < right:
+                mid = (left + right) // 2
+                if i_tg < tl.load(cu_seqlens + mid + 1).to(tl.int32):
+                    right = mid
+                else:
+                    left = mid + 1
+        i_n = left
+
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+        i_t = i_tg - bos
+    else:
+        bos = (i_tg // T) * T
+        i_t = i_tg % T
+
+    if i_t >= T:
+        return
+
+    i_c = i_t // BT  # chunk index
+    i_s = (i_t % BT) // BC  # sub-chunk index
+    i_tc = i_c * BT  # first token of the chunk
+    i_ts = i_tc + i_s * BC  # first token of the sub-chunk
+
+    q += bos * H * K
+    k += bos * H * K
+    g += bos * H * K
+    Aqk += bos * H * BT
+    Akk += bos * H * BC
+    beta += bos * H
+
+    BK: tl.constexpr = triton.next_power_of_2(K)
+    o_h = tl.arange(0, BH)
+    o_k = tl.arange(0, BK)
+    m_h = (i_hg * BH + o_h) < H
+    m_k = o_k < K
+
+    p_q = tl.make_block_ptr(q + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0))
+    p_k = tl.make_block_ptr(k + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0))
+    p_g = tl.make_block_ptr(g + i_t * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0))
+    p_beta = tl.make_block_ptr(beta + i_t * H, (H,), (1,), (i_hg * BH,), (BH,), (0,))
+    # [BH, BK]
+    b_q = tl.load(p_q, boundary_check=(0, 1)).to(tl.float32)
+    b_k = tl.load(p_k, boundary_check=(0, 1)).to(tl.float32)
+    b_g = tl.load(p_g, boundary_check=(0, 1)).to(tl.float32)
+    b_k = b_k * tl.load(p_beta, boundary_check=(0,)).to(tl.float32)[:, None]
+
+    for j in range(i_ts, min(i_t + 1, min(T, i_ts + BC))):
+        p_kj = tl.make_block_ptr(k + j * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0))
+        p_gj = tl.make_block_ptr(g + j * H * K, (H, K), (K, 1), (i_hg * BH, 0), (BH, BK), (1, 0))
+        # [BH, BK]
+        b_kj = tl.load(p_kj, boundary_check=(0, 1)).to(tl.float32)
+ 
b_gj = tl.load(p_gj, boundary_check=(0, 1)).to(tl.float32) + + b_kgj = b_kj * exp2(b_g - b_gj) + + b_kgj = tl.where(m_k[None, :], b_kgj, 0.0) + # [BH] + b_Aqk = tl.sum(b_q * b_kgj, axis=1) * scale + b_Akk = tl.sum(b_k * b_kgj, axis=1) * tl.where(j < i_t, 1.0, 0.0) + + tl.store(Aqk + i_t * H * BT + (i_hg * BH + o_h) * BT + j % BT, b_Aqk.to(Aqk.dtype.element_ty), mask=m_h) + tl.store(Akk + i_t * H * BC + (i_hg * BH + o_h) * BC + j - i_ts, b_Akk.to(Akk.dtype.element_ty), mask=m_h) + + +def chunk_kda_fwd_intra_token_parallel( + q: torch.Tensor, + k: torch.Tensor, + gk: torch.Tensor, + beta: torch.Tensor, + Aqk: torch.Tensor, + Akk: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + sub_chunk_size: int = 16, +) -> None: + """ + Token-parallel implementation: each token gets its own thread block. + Supports both fixed-length and variable-length sequences. + Reduces wasted computation on padding. + + Writes directly to Aqk and Akk tensors (in-place). + + Args: + q: [B, T, H, K] + k: [B, T, H, K] + gk: [B, T, H, K] cumsum of gates + beta: [B, T, H] + Aqk: [B, T, H, BT] output tensor to write to + Akk: [B, T, H, BC] output tensor for diagonal blocks (fp32) + scale: attention scale + chunk_size: BT (default 64) + sub_chunk_size: BC (default 16) + """ + B, T, H, K = q.shape + N = len(cu_seqlens) - 1 if cu_seqlens is not None else B + BT = chunk_size + BC = sub_chunk_size + + def grid(meta): + return (B * T, triton.cdiv(H, meta["BH"])) + + chunk_kda_fwd_kernel_intra_token_parallel[grid]( + q=q, + k=k, + g=gk, + beta=beta, + Aqk=Aqk, + Akk=Akk, + scale=scale, + cu_seqlens=cu_seqlens, + N=N, + T=T, + H=H, + K=K, + BT=BT, + BC=BC, + ) + return Aqk, Akk diff --git a/examples/kda/FLA_KDA/fla_chunk_o.py b/examples/kda/FLA_KDA/fla_chunk_o.py new file mode 100644 index 0000000000..c29db9508f --- /dev/null +++ b/examples/kda/FLA_KDA/fla_chunk_o.py @@ -0,0 +1,546 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + + +from .fla_utils import prepare_chunk_indices, exp, exp2, autotune_cache_kwargs, check_shared_mem + + +BK_LIST = [32, 64] if check_shared_mem() else [16, 32] +BV_LIST = [64, 128] if check_shared_mem("ampere") else [16, 32] + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in [32, 64] + for BV in [64, 128] + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_fwd_kernel_o( + q, + v, + g, + h, + o, + A, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_EXP2: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :] + + b_o = tl.zeros([BT, BV], dtype=tl.float32) + for i_k in range(tl.cdiv(K, BK)): + 
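+        # inter-chunk contribution: (q * scale * exp(g)) @ h pulls in the state
+        # carried over from all previous chunks, one key tile at a time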
p_q = tl.make_block_ptr(q + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_g = tl.make_block_ptr(g + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_h = tl.make_block_ptr(h + (i_tg * H + i_h) * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + # [BT, BK] + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_q = (b_q * scale).to(b_q.dtype) + # [BT, BK] + b_g = tl.load(p_g, boundary_check=(0, 1)) + # [BT, BK] + if USE_EXP2: + b_qg = (b_q * exp2(b_g)).to(b_q.dtype) + else: + b_qg = (b_q * exp(b_g)).to(b_q.dtype) + # [BK, BV] + b_h = tl.load(p_h, boundary_check=(0, 1)) + # works but dkw, owing to divine benevolence + # [BT, BV] + if i_k >= 0: + b_o += tl.dot(b_qg, b_h.to(b_qg.dtype)) + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_o = tl.make_block_ptr(o + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + # [BT, BT] + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_A = tl.where(m_s, b_A, 0.0).to(b_v.dtype) + b_o += tl.dot(b_A, b_v) + tl.store(p_o, b_o.to(p_o.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps, num_stages=num_stages) + for BK in BK_LIST + for BV in BV_LIST + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + ], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_bwd_kernel_dv( + k, + g, + A, + do, + dh, + dv, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_v, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1)) + p_do = tl.make_block_ptr(do + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + + b_A = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], b_A, 0.0) + # (SY 09/17) important to disallow tf32 here to maintain a good precision. 
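+    # b_A was loaded transposed and masked to the upper triangle, so this dot
+    # produces the intra-chunk part of dv with full fp32 accumulation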
+ b_dv = tl.dot(b_A, b_do.to(b_A.dtype), allow_tf32=False) + + for i_k in range(tl.cdiv(K, BK)): + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_gk = tl.make_block_ptr(g + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_gn = g + (bos + min(i_t * BT + BT, T) - 1) * H * K + i_h * K + o_k + p_dh = tl.make_block_ptr(dh + (i_tg * H + i_h) * K * V, (K, V), (V, 1), (i_k * BK, i_v * BV), (BK, BV), (1, 0)) + + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + b_gn = exp(tl.load(p_gn, mask=m_k, other=0)[None, :] - b_gk) + b_k = (b_k * b_gn).to(b_k.dtype) + # [BT, BV] + # (SY 09/17) it is ok to have bf16 interchunk gradient contribution here + b_dv += tl.dot(b_k, b_dh.to(b_k.dtype)) + + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + +@triton.heuristics( + { + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({"BK": BK, "BV": BV}, num_warps=num_warps) for BK in BK_LIST for BV in BV_LIST for num_warps in [2, 4, 8]], + key=["BT"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_gla_bwd_kernel_inter( + q, + k, + v, + g, + h, + do, + dh, + dq, + dk, + dq2, + dk2, + dg, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_k, i_t, i_bh = tl.program_id(0), tl.program_id(1), tl.program_id(2) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_tg = i_t + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32) + T = eos - bos + NT = tl.cdiv(T, BT) + else: + NT = tl.cdiv(T, BT) + i_tg = i_b * NT + i_t + bos, eos = i_b * T, i_b * T + T + o_k = i_k * BK + tl.arange(0, BK) + m_k = o_k < K + + q += (bos * H + i_h) * K + k += (bos * H + i_h) * K + v += (bos * H + i_h) * V + g += (bos * H + i_h) * K + h += (i_tg * H + i_h) * K * V + do += (bos * H + i_h) * V + dh += (i_tg * H + i_h) * K * V + dq += (bos * H + i_h) * K + dk += (bos * H + i_h) * K + dq2 += (bos * H + i_h) * K + dk2 += (bos * H + i_h) * K + dg += (bos * H + i_h) * K + + p_gk = tl.make_block_ptr(g, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_gk = tl.load(p_gk, boundary_check=(0, 1)) + p_gn = g + (min(T, i_t * BT + BT) - 1) * H * K + o_k + b_gn = tl.load(p_gn, mask=m_k, other=0) + b_dq = tl.zeros([BT, BK], dtype=tl.float32) + b_dk = tl.zeros([BT, BK], dtype=tl.float32) + b_dgk = tl.zeros([BK], dtype=tl.float32) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_h = tl.make_block_ptr(h, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + p_dh = tl.make_block_ptr(dh, (V, K), (1, V), (i_v * BV, i_k * BK), (BV, BK), (0, 1)) + # [BT, BV] + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_do = tl.load(p_do, boundary_check=(0, 1)) + # [BV, BK] + b_h = tl.load(p_h, boundary_check=(0, 1)) + b_dh = tl.load(p_dh, boundary_check=(0, 1)) + + # [BK] + b_dgk += tl.sum(b_h * b_dh, axis=0) + # [BT, BK] + b_dq += tl.dot(b_do, 
b_h.to(b_do.dtype)) + b_dk += tl.dot(b_v, b_dh.to(b_v.dtype)) + + b_dgk *= exp(b_gn) + b_dq *= scale + b_dq = b_dq * exp(b_gk) + b_dk = b_dk * exp(b_gn[None, :] - b_gk) + + p_q = tl.make_block_ptr(q, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_k = tl.make_block_ptr(k, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dq = tl.make_block_ptr(dq, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_q = tl.load(p_q, boundary_check=(0, 1)) + b_k = tl.load(p_k, boundary_check=(0, 1)) + b_dgk += tl.sum(b_dk * b_k, axis=0) + b_dq += tl.load(p_dq, boundary_check=(0, 1)) + b_dk += tl.load(p_dk, boundary_check=(0, 1)) + b_dg = b_q * b_dq - b_k * b_dk + # tl.debug_barrier() + b_dg = b_dg - tl.cumsum(b_dg, axis=0) + tl.sum(b_dg, axis=0)[None, :] + b_dgk[None, :] + # Buggy due to strange triton compiler issue. + # m_s = tl.where(tl.arange(0, BT)[:, None] <= tl.arange(0, BT)[None, :], 1., 0.) + # b_dg = tl.dot(m_s, b_dg, allow_tf32=False) + b_dgk[None, :] + p_dq = tl.make_block_ptr(dq2, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk2, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + tl.store(p_dq, b_dq.to(p_dq.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dk, b_dk.to(p_dk.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg, b_dg.to(p_dg.dtype.element_ty), boundary_check=(0, 1)) + + +def chunk_gla_fwd_o_gk( + q: torch.Tensor, + v: torch.Tensor, + g: torch.Tensor, + A: torch.Tensor, + h: torch.Tensor, + scale: float, + cu_seqlens: torch.LongTensor = None, + chunk_size: int = 64, + chunk_indices: torch.LongTensor = None, + use_exp2: bool = False, +): + B, T, H, K, V = *q.shape, v.shape[-1] + BT = chunk_size + + if chunk_indices is None and cu_seqlens is not None: + chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size) + NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices) + + o = torch.empty_like(v) + + def grid(meta): + return (triton.cdiv(V, meta["BV"]), NT, B * H) + + chunk_gla_fwd_kernel_o[grid]( + q=q, + v=v, + g=g, + h=h, + o=o, + A=A, + cu_seqlens=cu_seqlens, + chunk_indices=chunk_indices, + scale=scale, + T=T, + H=H, + K=K, + V=V, + BT=BT, + USE_EXP2=use_exp2, + ) + return o + + +NUM_WARPS = [2, 4] + + +@triton.heuristics( + { + "USE_G": lambda args: args["g"] is not None, + "USE_G_GAMMA": lambda args: args["g_gamma"] is not None, + "USE_A": lambda args: args["A"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in NUM_WARPS for num_stages in [2, 3, 4]], + key=["H", "K", "V", "BT", "BK", "BV", "USE_G"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def chunk_bwd_kernel_dv_local( + q, + k, + g, + g_gamma, + A, + do, + dv, + cu_seqlens, + chunk_indices, + scale, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + USE_G: tl.constexpr, + USE_G_GAMMA: tl.constexpr, + USE_A: tl.constexpr, + IS_VARLEN: tl.constexpr, +): + i_t, i_bh = tl.program_id(0), tl.program_id(1) + i_b, i_h = i_bh // H, i_bh % H + if IS_VARLEN: + i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32) + bos, eos = tl.load(cu_seqlens + 
i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+
+    # offset calculation
+    q += (bos * H + i_h) * K
+    k += (bos * H + i_h) * K
+    do += (bos * H + i_h) * V
+    dv += (bos * H + i_h) * V
+
+    if USE_A:
+        p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1))
+        b_A = tl.load(p_A, boundary_check=(0, 1))
+
+    o_t = i_t * BT + tl.arange(0, BT)
+    m_t = o_t < T
+    m_A = (o_t[:, None] <= o_t[None, :]) & (m_t[:, None] & m_t)
+    b_A = tl.where(m_A, b_A, 0).to(do.dtype.element_ty)
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_do = tl.make_block_ptr(do, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        p_dv = tl.make_block_ptr(dv, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        b_do = tl.load(p_do, boundary_check=(0, 1))
+        b_dv = tl.dot(b_A.to(b_do.dtype), b_do)
+        tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_bwd_dv_local(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    do: torch.Tensor,
+    g: torch.Tensor = None,
+    g_gamma: torch.Tensor = None,
+    A: torch.Tensor = None,
+    scale: float = None,
+    cu_seqlens: torch.LongTensor = None,
+    chunk_size: int = 64,
+    chunk_indices: torch.LongTensor = None,
+) -> torch.Tensor:
+    B, T, H, K, V = *k.shape, do.shape[-1]
+    BT = chunk_size
+    if chunk_indices is None and cu_seqlens is not None:
+        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+    # H100 can use a larger tile; Ampere-class GPUs get 64, older GPUs 32
+    if check_shared_mem("hopper", k.device.index):
+        CONST_TILING = 128
+    elif check_shared_mem("ampere", k.device.index):
+        CONST_TILING = 64
+    else:
+        CONST_TILING = 32
+    BK = min(max(triton.next_power_of_2(K), 16), CONST_TILING)
+    BV = min(max(triton.next_power_of_2(V), 16), CONST_TILING)
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    dv = torch.empty_like(do)
+    grid = (NT, B * H)
+    chunk_bwd_kernel_dv_local[grid](
+        q=q,
+        k=k,
+        g=g,
+        g_gamma=g_gamma,
+        A=A,
+        do=do,
+        dv=dv,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        scale=scale,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+    )
+    return dv
+
+
+@triton.heuristics(
+    {
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [1, 2, 4, 8] for num_stages in [2, 3, 4]],
+    key=["BV", "BT"],
+    **autotune_cache_kwargs,
+)
+@triton.jit(do_not_specialize=["T"])
+def chunk_gla_bwd_kernel_dA(
+    v,
+    do,
+    dA,
+    cu_seqlens,
+    chunk_indices,
+    scale,
+    T,
+    H: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    T = eos - bos
+
+    b_dA = tl.zeros([BT, BT], dtype=tl.float32)
+    for i_v in range(tl.cdiv(V, BV)):
+        p_do = tl.make_block_ptr(do + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (V, T), (1, H * V), (i_v * BV, i_t * BT), (BV, BT), (0, 1))
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_do = tl.load(p_do, boundary_check=(0, 1))
+
+        b_dA += tl.dot(b_do, b_v)
+
+    p_dA = tl.make_block_ptr(dA + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+    m_s = tl.arange(0, BT)[:, None] >= tl.arange(0, BT)[None, :]
+    b_dA = tl.where(m_s, b_dA * scale, 0.0)
+    tl.store(p_dA, b_dA.to(p_dA.dtype.element_ty), boundary_check=(0, 1))
+
+
+def chunk_gla_bwd_dA(
+    v: torch.Tensor,
+    do: torch.Tensor,
+    scale: float,
+    cu_seqlens: torch.LongTensor = None,
+    chunk_size: int = 64,
+    chunk_indices: torch.LongTensor = None,
+):
+    B, T, H, V = v.shape
+    BT = chunk_size
+
+    if chunk_indices is None and cu_seqlens is not None:
+        chunk_indices = prepare_chunk_indices(cu_seqlens, chunk_size)
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    BV = min(64, triton.next_power_of_2(V))
+
+    dA = v.new_empty(B, T, H, BT, dtype=torch.float32)
+    grid = (NT, B * H)
+    chunk_gla_bwd_kernel_dA[grid](
+        v=v,
+        do=do,
+        dA=dA,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        scale=scale,
+        T=T,
+        H=H,
+        V=V,
+        BT=BT,
+        BV=BV,
+    )
+    return dA
diff --git a/examples/kda/FLA_KDA/fla_utils.py b/examples/kda/FLA_KDA/fla_utils.py
new file mode 100644
index 0000000000..b278aec909
--- /dev/null
+++ b/examples/kda/FLA_KDA/fla_utils.py
@@ -0,0 +1,240 @@
+import contextlib
+import functools
+import inspect
+import os
+import warnings
+from collections.abc import Callable
+from typing import Any
+from packaging import version
+from enum import Enum
+
+import torch
+import triton
+import triton.language.extra.libdevice as tldevice
+
+
+device = "cuda"
+device_torch_lib = getattr(torch, device)
+
+exp = tldevice.fast_expf
+exp2 = tldevice.exp2
+log = tldevice.fast_logf
+log2 = tldevice.fast_log2f
+
+IS_NVIDIA_HOPPER = torch.cuda.is_available() and ("NVIDIA H" in torch.cuda.get_device_name(0) or torch.cuda.get_device_capability()[0] >= 9)
+USE_CUDA_GRAPH = os.environ.get("FLA_USE_CUDA_GRAPH", "0") == "1"
+
+
+FLA_CACHE_RESULTS = os.getenv("FLA_CACHE_RESULTS", "1") == "1"
+SUPPORTS_AUTOTUNE_CACHE = "cache_results" in inspect.signature(triton.autotune).parameters
+autotune_cache_kwargs = {"cache_results": FLA_CACHE_RESULTS} if SUPPORTS_AUTOTUNE_CACHE else {}
+
+
+# error-checking helpers for comparing kernel outputs against a reference
+def get_abs_err(x, y):
+    return (x.detach() - y.detach()).flatten().abs().max().item()
+
+
+def get_err_ratio(x, y):
+    err = (x.detach() - y.detach()).flatten().square().mean().sqrt().item()
+    base = (x.detach()).flatten().square().mean().sqrt().item()
+    return err / (base + 1e-8)
+
+
+def assert_close(prefix, ref, tri, ratio, warning=False, err_atol=1e-6):
+    abs_atol = get_abs_err(ref, tri)
+    msg = f"{prefix:>16} diff: {abs_atol:.6f} ratio: {get_err_ratio(ref, tri):.6f}"
+    print(msg)
+    error_rate = get_err_ratio(ref, tri)
+    if abs_atol <= err_atol:
+        return
+    if warning or (error_rate < 0.01 or abs_atol <= 0.3):
+        if error_rate > ratio:
+            warnings.warn(msg, stacklevel=2)
+    else:
+        assert error_rate < ratio, msg
+
+
+def tensor_cache(
+    fn: Callable[..., torch.Tensor],
+) -> Callable[..., torch.Tensor]:
+    """
+    A decorator that caches the most recent result of a function with tensor inputs.
+
+    This decorator will store the output of the decorated function for the most recent set of input tensors.
+    If the function is called again with the same input tensors, it will return the cached result.
+
+    Args:
+        fn (Callable[..., torch.Tensor]):
+            The function to be decorated. It should take tensor inputs and return tensor outputs.
+
+    Returns:
+        Callable[..., torch.Tensor]:
+            A wrapped version of the input function with single-entry caching.
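+
+    A minimal illustrative sketch (the `lens` helper below is hypothetical):
+        >>> @tensor_cache
+        ... def lens(cu_seqlens: torch.Tensor) -> torch.Tensor:
+        ...     return torch.diff(cu_seqlens)
+        >>> cu = torch.tensor([0, 4, 9])
+        >>> lens(cu) is lens(cu)  # second call is served from the cache
+        True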
+ """ + last_args: tuple | None = None + last_kwargs: dict | None = None + last_result: Any = None + + @functools.wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + nonlocal last_args, last_kwargs, last_result + + if ( + last_args is not None + and last_kwargs is not None + and len(args) == len(last_args) + and len(kwargs) == len(last_kwargs) + and all(a is b for a, b in zip(args, last_args)) + and all(k in last_kwargs and v is last_kwargs[k] for k, v in kwargs.items()) + ): + return last_result + + result = fn(*args, **kwargs) + last_args, last_kwargs, last_result = args, kwargs, result + return result + + return wrapper + + +@tensor_cache +def prepare_lens(cu_seqlens: torch.LongTensor) -> torch.LongTensor: + return torch.diff(cu_seqlens) + + +@tensor_cache +def prepare_chunk_indices( + cu_seqlens: torch.LongTensor, + chunk_size: int, +) -> torch.LongTensor: + indices = torch.cat([torch.arange(n) for n in triton.cdiv(prepare_lens(cu_seqlens), chunk_size).tolist()]) + return torch.stack([indices.eq(0).cumsum(0) - 1, indices], 1).to(cu_seqlens) + + +# @functools.cache +# def get_multiprocessor_count(tensor_idx: int = 0) -> int: +# try: +# return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)['multiprocessor_count'] +# except BaseException: +# # Maybe we use a NPU device. +# if triton.runtime.driver.active.get_current_target().backend == 'npu': +# return triton.runtime.driver.active.utils.get_device_properties(tensor_idx)['num_vectorcore'] +# else: +# return 1 +@functools.cache +def get_multiprocessor_count(tensor_idx: int = 0) -> int: + """ + Compatible across Triton versions: + - 2.0.x + - 2.1.0 + - 2.2.x and above + Supports CUDA and NPU. + """ + + # ---- Try the newer Triton 2.2+ API ---- + try: + drv = triton.runtime.driver.active + props = drv.utils.get_device_properties(tensor_idx) + return props.get("multiprocessor_count") or props.get("num_vectorcore") or 1 + except Exception: + pass + + # ---- Fallback: Triton 2.0 / 2.1 API ---- + try: + cuda = triton.runtime.driver.CudaDriver + dev = cuda.get_current_device() + props = cuda.get_device_properties(dev) + return props.get("multiprocessor_count", 1) + except Exception: + pass + + return 1 + + +def input_guard( + fn: Callable[..., torch.Tensor], +) -> Callable[..., torch.Tensor]: + """ + A decorator to make sure all input tensors are contiguous and set the device based on input tensors. 
+ """ + + @functools.wraps(fn) + def wrapper(*args, **kwargs): + contiguous_args = (i if not isinstance(i, torch.Tensor) else i.contiguous() for i in args) + contiguous_kwargs = {k: (v if not isinstance(v, torch.Tensor) else v.contiguous()) for k, v in kwargs.items()} + + tensor = None + for arg in args: + if isinstance(arg, torch.Tensor): + tensor = arg + break + if tensor is None: + for value in kwargs.values(): + if isinstance(value, torch.Tensor): + tensor = value + break + + if tensor is not None: + ctx = custom_device_ctx(tensor.device.index) + else: + ctx = contextlib.nullcontext() + + with ctx: + return fn(*contiguous_args, **contiguous_kwargs) + + return wrapper + + +@functools.cache +def check_pytorch_version(version_s: str = "2.4") -> bool: + return version.parse(torch.__version__) >= version.parse(version_s) + + +if check_pytorch_version("2.4"): + device = "cuda" + autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=device) + autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=device) + + def custom_device_ctx(index: int): + return device_torch_lib.device(index) +else: + assert device == "cuda", "Only cuda device is supported for PyTorch version < 2.4.0." + autocast_custom_fwd = device_torch_lib.amp.custom_fwd + autocast_custom_bwd = device_torch_lib.amp.custom_bwd + + def custom_device_ctx(index: int): + return torch.cuda.device(index) + + +class Backend(Enum): + ADA = 101376 # RTX 4090 + AMPERE = 166912 # A100 + HOPPER = 232448 # H100 + DEFAULT = 102400 # Default + + @classmethod + def get_shared_memory(cls, arch: str) -> int: + try: + return cls[arch.upper()].value + except KeyError: + return cls.DEFAULT.value + + +def get_all_max_shared_mem(): + try: + return [ + triton.runtime.driver.active.utils.get_device_properties(i)["max_shared_mem"] for i in range(device_torch_lib.device_count()) + ] + except BaseException: + return [-1] + + +@functools.cache +def check_shared_mem(arch: str = "none", tensor_idx: int = 0) -> bool: + try: + device_shared_mem_list = get_all_max_shared_mem() + max_shared_memory = device_shared_mem_list[tensor_idx] + return max_shared_memory >= Backend.get_shared_memory(arch) + except Exception: + return False diff --git a/examples/kda/FLA_KDA/fla_wy_fast.py b/examples/kda/FLA_KDA/fla_wy_fast.py new file mode 100644 index 0000000000..a042c2a5fe --- /dev/null +++ b/examples/kda/FLA_KDA/fla_wy_fast.py @@ -0,0 +1,312 @@ +# Copyright (c) 2023-2025, Songlin Yang, Yu Zhang + +import torch +import triton +import triton.language as tl + +from .fla_utils import prepare_chunk_indices, exp2, autotune_cache_kwargs + + +@triton.heuristics( + { + "STORE_QG": lambda args: args["qg"] is not None, + "STORE_KG": lambda args: args["kg"] is not None, + "IS_VARLEN": lambda args: args["cu_seqlens"] is not None, + } +) +@triton.autotune( + configs=[ + triton.Config({"DOT_PRECISION": DOT_PRECISION}, num_warps=num_warps, num_stages=num_stages) + for num_warps in [2, 4, 8] + for num_stages in [2, 3, 4] + for DOT_PRECISION in (["tf32x3", "ieee"]) + ], + key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"], + **autotune_cache_kwargs, +) +@triton.jit(do_not_specialize=["T"]) +def recompute_w_u_fwd_kernel( + q, + k, + qg, + kg, + v, + beta, + w, + u, + A, + gk, + cu_seqlens, + chunk_indices, + T, + H: tl.constexpr, + K: tl.constexpr, + V: tl.constexpr, + BT: tl.constexpr, + BK: tl.constexpr, + BV: tl.constexpr, + STORE_QG: tl.constexpr, + STORE_KG: tl.constexpr, + IS_VARLEN: tl.constexpr, + DOT_PRECISION: tl.constexpr, +): + i_t, i_bh = 
tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + i_n).to(tl.int32), tl.load(cu_seqlens + i_n + 1).to(tl.int32)
+        T = eos - bos
+    else:
+        bos, eos = i_b * T, i_b * T + T
+    p_b = tl.make_block_ptr(beta + bos * H + i_h, (T,), (H,), (i_t * BT,), (BT,), (0,))
+    b_b = tl.load(p_b, boundary_check=(0,))
+
+    p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0))
+    b_A = tl.load(p_A, boundary_check=(0, 1))
+
+    for i_v in range(tl.cdiv(V, BV)):
+        p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        p_u = tl.make_block_ptr(u + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0))
+        b_v = tl.load(p_v, boundary_check=(0, 1))
+        b_vb = (b_v * b_b[:, None]).to(b_v.dtype)
+        b_u = tl.dot(b_A, b_vb, input_precision=DOT_PRECISION)
+        tl.store(p_u, b_u.to(p_u.dtype.element_ty), boundary_check=(0, 1))
+
+    for i_k in range(tl.cdiv(K, BK)):
+        p_w = tl.make_block_ptr(w + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        b_k = tl.load(p_k, boundary_check=(0, 1))
+        b_kb = b_k * b_b[:, None]  # scale k by beta
+
+        p_gk = tl.make_block_ptr(gk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+        b_gk = tl.load(p_gk, boundary_check=(0, 1))
+        b_kb *= exp2(b_gk)
+        if STORE_QG:
+            p_q = tl.make_block_ptr(q + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+            p_qg = tl.make_block_ptr(qg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+            b_q = tl.load(p_q, boundary_check=(0, 1))
+            b_qg = b_q * exp2(b_gk)
+            tl.store(p_qg, b_qg.to(p_qg.dtype.element_ty), boundary_check=(0, 1))
+        if STORE_KG:
+            last_idx = min(i_t * BT + BT, T) - 1
+            o_k = i_k * BK + tl.arange(0, BK)
+            m_k = o_k < K
+            b_gn = tl.load(gk + ((bos + last_idx) * H + i_h) * K + o_k, mask=m_k, other=0.0)  # gate at the last position of the chunk
+            b_kg = b_k * tl.where((i_t * BT + tl.arange(0, BT) < T)[:, None], exp2(b_gn[None, :] - b_gk), 0)
+            p_kg = tl.make_block_ptr(kg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0))
+            tl.store(p_kg, b_kg.to(p_kg.dtype.element_ty), boundary_check=(0, 1))
+
+        b_w = tl.dot(b_A, b_kb.to(b_k.dtype))
+        tl.store(p_w, b_w.to(p_w.dtype.element_ty), boundary_check=(0, 1))
+
+
+@triton.heuristics(
+    {
+        "IS_VARLEN": lambda args: args["cu_seqlens"] is not None,
+    }
+)
+@triton.autotune(
+    configs=[triton.Config({}, num_warps=num_warps, num_stages=num_stages) for num_warps in [2, 4] for num_stages in [2, 3, 4]],
+    key=["H", "K", "V", "BT", "BK", "BV", "IS_VARLEN"],
+    **autotune_cache_kwargs,
+)
+@triton.jit(do_not_specialize=["T"])
+def prepare_wy_repr_bwd_kernel(
+    k,
+    v,
+    beta,
+    gk,
+    A,
+    dA,
+    dw,
+    du,
+    dk,
+    dk2,
+    dv,
+    db,
+    dg,
+    dg2,
+    cu_seqlens,
+    chunk_indices,
+    T,
+    H: tl.constexpr,
+    K: tl.constexpr,
+    V: tl.constexpr,
+    BT: tl.constexpr,
+    BK: tl.constexpr,
+    BV: tl.constexpr,
+    IS_VARLEN: tl.constexpr,
+):
+    i_t, i_bh = tl.program_id(0), tl.program_id(1)
+    i_b, i_h = i_bh // H, i_bh % H
+    if IS_VARLEN:
+        i_n, i_t = tl.load(chunk_indices + i_t * 2).to(tl.int32), tl.load(chunk_indices + i_t * 2 + 1).to(tl.int32)
+        bos, eos = tl.load(cu_seqlens + 
i_n + 1).to(tl.int32) + T = eos - bos + else: + bos, eos = i_b * T, i_b * T + T + + p_b = tl.make_block_ptr(beta + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_db = tl.make_block_ptr(db + (bos * H + i_h), (T,), (H,), (i_t * BT,), (BT,), (0,)) + p_A = tl.make_block_ptr(A + (bos * H + i_h) * BT, (BT, T), (1, H * BT), (0, i_t * BT), (BT, BT), (0, 1)) + + b_b = tl.load(p_b, boundary_check=(0,)) + b_db = tl.zeros([BT], dtype=tl.float32) + b_A = tl.load(p_A, boundary_check=(0, 1)) + b_dA = tl.zeros([BT, BT], dtype=tl.float32) + + for i_k in range(tl.cdiv(K, BK)): + p_k = tl.make_block_ptr(k + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk = tl.make_block_ptr(dk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dk2 = tl.make_block_ptr(dk2 + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dw = tl.make_block_ptr(dw + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg = tl.make_block_ptr(dg + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + p_dg2 = tl.make_block_ptr(dg2 + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + + # [BT, BK] + b_k = tl.load(p_k, boundary_check=(0, 1)) + p_gk = tl.make_block_ptr(gk + (bos * H + i_h) * K, (T, K), (H * K, 1), (i_t * BT, i_k * BK), (BT, BK), (1, 0)) + b_gk_exp = exp2(tl.load(p_gk, boundary_check=(0, 1))) + b_kbg = b_k * b_b[:, None] * b_gk_exp + b_dw = tl.load(p_dw, boundary_check=(0, 1)) + + b_dA += tl.dot(b_dw, tl.trans(b_kbg).to(b_dw.dtype)) + b_dkbg = tl.dot(b_A, b_dw) + b_dk = b_dkbg * b_gk_exp * b_b[:, None] + tl.load(p_dk, boundary_check=(0, 1)) + b_db += tl.sum(b_dkbg * b_k * b_gk_exp, 1) + b_dg = b_kbg * b_dkbg + tl.load(p_dg, boundary_check=(0, 1)) + + tl.store(p_dk2, b_dk.to(p_dk2.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_dg2, b_dg.to(p_dg2.dtype.element_ty), boundary_check=(0, 1)) + + for i_v in range(tl.cdiv(V, BV)): + p_v = tl.make_block_ptr(v + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_dv = tl.make_block_ptr(dv + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + p_du = tl.make_block_ptr(du + (bos * H + i_h) * V, (T, V), (H * V, 1), (i_t * BT, i_v * BV), (BT, BV), (1, 0)) + b_v = tl.load(p_v, boundary_check=(0, 1)) + b_vb = (b_v * b_b[:, None]).to(b_v.dtype) + b_du = tl.load(p_du, boundary_check=(0, 1)) + b_dA += tl.dot(b_du, tl.trans(b_vb)) + b_dvb = tl.dot(b_A, b_du) + b_dv = b_dvb * b_b[:, None] + b_db += tl.sum(b_dvb * b_v, 1) + tl.store(p_dv, b_dv.to(p_dv.dtype.element_ty), boundary_check=(0, 1)) + + o_t = i_t * BT + tl.arange(0, BT) + m_t = o_t < T + m_A = (o_t[:, None] > o_t[None, :]) & (m_t[:, None] & m_t) + b_dA = tl.where(m_A, b_dA, 0) + b_dA = tl.dot(b_dA.to(b_A.dtype), b_A) + b_dA = tl.dot(b_A, b_dA.to(b_A.dtype)) + + b_dA = tl.where(m_A, -b_dA, 0) + + # if using gk, save dA first and handle dk in another kernel + p_dA = tl.make_block_ptr(dA + (bos * H + i_h) * BT, (T, BT), (H * BT, 1), (i_t * BT, 0), (BT, BT), (1, 0)) + tl.store(p_dA, b_dA.to(p_dA.dtype.element_ty), boundary_check=(0, 1)) + tl.store(p_db, b_db.to(p_db.dtype.element_ty), boundary_check=(0,)) + + +def recompute_w_u_fwd( + k: torch.Tensor, + v: torch.Tensor, + beta: torch.Tensor, + A: torch.Tensor, + q: torch.Tensor = None, + gk: torch.Tensor = None, + cu_seqlens: torch.LongTensor = None, + chunk_indices: torch.LongTensor = None, +) -> tuple[torch.Tensor, 
torch.Tensor, torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    BT = A.shape[-1]
+    BK = 64
+    BV = 64
+
+    if chunk_indices is None and cu_seqlens is not None:
+        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+
+    w = torch.empty_like(k)
+    u = torch.empty_like(v)
+    qg = torch.empty_like(q) if q is not None else None
+    kg = torch.empty_like(k) if gk is not None else None
+    recompute_w_u_fwd_kernel[(NT, B * H)](
+        q=q,
+        k=k,
+        qg=qg,
+        kg=kg,
+        v=v,
+        beta=beta,
+        w=w,
+        u=u,
+        A=A,
+        gk=gk,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+    )
+    return w, u, qg, kg
+
+
+def prepare_wy_repr_bwd(
+    k: torch.Tensor,
+    v: torch.Tensor,
+    beta: torch.Tensor,
+    gk: torch.Tensor,
+    A: torch.Tensor,
+    dk: torch.Tensor,
+    dw: torch.Tensor,
+    du: torch.Tensor,
+    dg: torch.Tensor,
+    cu_seqlens: torch.LongTensor = None,
+    chunk_indices: torch.LongTensor = None,
+) -> tuple[torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor, torch.Tensor]:
+    B, T, H, K, V = *k.shape, v.shape[-1]
+    BT = 64
+    if chunk_indices is None and cu_seqlens is not None:
+        chunk_indices = prepare_chunk_indices(cu_seqlens, BT)
+    NT = triton.cdiv(T, BT) if cu_seqlens is None else len(chunk_indices)
+    CONST_TILING = 64
+    BK = min(max(triton.next_power_of_2(K), 16), CONST_TILING)
+    BV = min(max(triton.next_power_of_2(V), 16), CONST_TILING)
+
+    dk2 = torch.empty_like(dk, dtype=torch.float)
+    dv = torch.empty_like(v)
+    dg2 = torch.empty_like(gk, dtype=torch.float)
+    dA = torch.empty_like(A, dtype=torch.float)
+    db = torch.empty_like(beta, dtype=torch.float)
+    prepare_wy_repr_bwd_kernel[(NT, B * H)](
+        k=k,
+        v=v,
+        beta=beta,
+        gk=gk,
+        A=A,
+        dA=dA,
+        dw=dw,
+        du=du,
+        dk=dk,
+        dk2=dk2,
+        dv=dv,
+        db=db,
+        dg=dg,
+        dg2=dg2,
+        cu_seqlens=cu_seqlens,
+        chunk_indices=chunk_indices,
+        T=T,
+        H=H,
+        K=K,
+        V=V,
+        BT=BT,
+        BK=BK,
+        BV=BV,
+    )
+    dk = dk2
+    dg = dg2
+
+    return dk, dv, db, dg, dA
diff --git a/examples/kda/README.md b/examples/kda/README.md
new file mode 100644
index 0000000000..f445a9f097
--- /dev/null
+++ b/examples/kda/README.md
@@ -0,0 +1,40 @@
+# KDA kernel implementation with TileLang
+## Requirements
+- TileLang: 0.1.6.post2+cuda.git729e66ca
+- triton: 3.2.0
+- FLA: commit 9714c5 (used for comparison)
+
+We copy the needed files and functions from flash-linear-attention into `FLA_KDA/` so the comparison is easy to run.
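+
+## Reference semantics
+
+Per chunk of size `BT`, `recompute_w_u_fwd` in `FLA_KDA/fla_wy_fast.py` computes `u = A @ (v * beta)`, `w = A @ (k * beta * 2^gk)`, the decayed query `qg = q * 2^gk`, and `kg = k * 2^(g_last - gk)`, where `gk` is the chunk-local cumulative log2 gate and `g_last` is its value at the chunk's last token. Below is a minimal PyTorch sketch of the same computation for the fixed-length (non-varlen) case; the helper name `reference_w_u` and the reshape/permute layout are illustrative choices, not part of the FLA API, and it assumes `T` is divisible by `BT`.
+
+```python
+import torch
+
+
+def reference_w_u(q, k, v, beta, A, gk, BT):
+    # q, k, gk: [B, T, H, K]; v: [B, T, H, V]; beta: [B, T, H]; A: [B, T, H, BT]
+    B, T, H = k.shape[:3]
+    NT = T // BT
+
+    def chunks(x):
+        # [B, T, H, D] -> [B, NT, H, BT, D] so matmuls act on [BT, D] tiles
+        return x.view(B, NT, BT, H, -1).permute(0, 1, 3, 2, 4)
+
+    qc, kc, vc, gc = chunks(q), chunks(k), chunks(v), chunks(gk)
+    bc = beta.view(B, NT, BT, H).permute(0, 1, 3, 2)  # [B, NT, H, BT]
+    Ac = A.view(B, NT, BT, H, BT).permute(0, 1, 3, 2, 4)  # [B, NT, H, BT, BT]
+    u = Ac @ (vc * bc[..., None])  # A @ (v * beta)
+    w = Ac @ (kc * bc[..., None] * gc.exp2())  # A @ (k * beta * 2^gk)
+    qg = qc * gc.exp2()  # decay applied to q
+    g_last = gc[..., -1:, :]  # gate at the chunk's last token
+    kg = kc * (g_last - gc).exp2()  # rescale k to the chunk boundary
+
+    def flat(x):
+        # [B, NT, H, BT, D] -> [B, T, H, D]
+        return x.permute(0, 1, 3, 2, 4).reshape(B, T, H, -1)
+
+    return flat(w), flat(u), flat(qg), flat(kg)
+```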
diff --git a/examples/kda/chunk_bwd_dqkwg.py b/examples/kda/chunk_bwd_dqkwg.py new file mode 100644 index 0000000000..d3d4df4b44 --- /dev/null +++ b/examples/kda/chunk_bwd_dqkwg.py @@ -0,0 +1,274 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_inter import chunk_kda_bwd_dqkwg +from test_utils_kda import do_bench, compare_tensors + +import torch + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + gate_dtype, +): + BS = S // chunk_size + q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + v_new = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + w = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + g = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + h = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + dv = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + do = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + dh = torch.randn(B, BS, H, DK, DV, dtype=input_dtype).cuda() + + return q, k, v_new, w, g, h, dv, do, dh + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + gate_dtype, +): + dq = torch.randn(B, S, H, DK, dtype=torch.float32).cuda() + dk = torch.randn(B, S, H, DK, dtype=torch.float32).cuda() + dw = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + dg = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + return dq, dk, dw, dg + + +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-4, -3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def chunk_bwd_dqkwg( + B, + S, + H, + DK, + DV, + scale, + chunk_size, + input_dtype, + gate_dtype, + block_DK=32, + block_DV=32, + threads=32, + num_stages=0, +): + block_S = chunk_size + BS = S // block_S + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + H_shape = (B, BS, H, DK, DV) + + @T.prim_func + def kernel( + Q: T.Tensor(K_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + V: T.Tensor(V_shape, dtype=input_dtype), + G: T.Tensor(K_shape, dtype=gate_dtype), + h: T.Tensor(H_shape, dtype=input_dtype), + dv: T.Tensor(V_shape, dtype=input_dtype), + DO: T.Tensor(V_shape, dtype=input_dtype), + Dh: T.Tensor(H_shape, dtype=input_dtype), + dq: T.Tensor(K_shape, dtype=T.float32), + dk: T.Tensor(K_shape, dtype=T.float32), + dw: T.Tensor(K_shape, dtype=gate_dtype), + dg: T.Tensor(K_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(DK, block_DK), T.ceildiv(S, block_S), B * H, threads=threads) as (bk, bs, bbh): + bb, bh = bbh // H, bbh % H + chunk_last_idx = T.min(S, (bs + 1) * block_S) - 1 + + dgkn_fragment = T.alloc_fragment((block_DK), dtype=T.float32) + dgkn_fragment_tmp = T.alloc_fragment((block_DK,), dtype=T.float32) + dq_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dk_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dw_fragment = T.alloc_fragment((block_S, block_DK), dtype=T.float32) + dgk_shared = T.alloc_shared((block_S, block_DK), dtype=T.float32) + + h_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dh_shared = 
T.alloc_shared((block_DK, block_DV), dtype=input_dtype) + dgkn_shared = T.alloc_shared((block_DK, block_DV), dtype=input_dtype) # d of last token in a chunk + V_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + DO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + DV_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + G_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) # chunk G + Gn_shared = T.alloc_shared((block_DK), dtype=input_dtype) # chunk last token G + Q_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, block_DK), dtype=input_dtype) + + dkkn_shared = T.alloc_shared((block_S, block_DK), dtype=T.float32) + pp_shared = T.alloc_shared((block_DK), dtype=T.float32) + + T.clear(dgkn_fragment) + T.clear(dq_fragment) + T.clear(dk_fragment) + T.clear(dw_fragment) + + T.copy(G[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], G_shared) + T.copy(G[bb, chunk_last_idx, bh, bk * block_DK : (bk + 1) * block_DK], Gn_shared) + + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy(h[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], h_shared) + T.copy(Dh[bb, bs, bh, bk * block_DK : (bk + 1) * block_DK, i_v * block_DV : (i_v + 1) * block_DV], dh_shared) + T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared) + T.copy(DO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], DO_shared) + T.copy(dv[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], DV_shared) + # += reduce_sum + for i_k1, i_v1 in T.Parallel(block_DK, block_DV): + dgkn_shared[i_k1, i_v1] = h_shared[i_k1, i_v1] * dh_shared[i_k1, i_v1] + T.reduce_sum(dgkn_shared, dgkn_fragment_tmp, dim=1, clear=True) # [block_DK] + for i_ks in T.Parallel(block_DK): + dgkn_fragment[i_ks] += dgkn_fragment_tmp[i_ks] + T.gemm(DO_shared, h_shared, dq_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + T.gemm(V_shared, dh_shared, dk_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + T.gemm(DV_shared, h_shared, dw_fragment, transpose_B=True, clear_accum=False) # [block_S, block_DK] + # chunk last token + for i_k0 in T.Parallel(block_DK): + dgkn_fragment[i_k0] = dgkn_fragment[i_k0] * T.exp2(Gn_shared[i_k0]) + + for i_s, i_k in T.Parallel(block_S, block_DK): + dw_fragment[i_s, i_k] = -dw_fragment[i_s, i_k] + dq_fragment[i_s, i_k] = dq_fragment[i_s, i_k] * scale * T.exp2(G_shared[i_s, i_k]) + dk_fragment[i_s, i_k] = dk_fragment[i_s, i_k] * T.exp2(Gn_shared[i_k] - G_shared[i_s, i_k]) + + T.copy(dw_fragment, dw[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dq_fragment, dq[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + T.copy(dk_fragment, dk[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + T.copy(Q[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], Q_shared) + T.copy(K[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK], K_shared) + + for i_s2, i_k2 in T.Parallel(block_S, block_DK): + dkkn_shared[i_s2, i_k2] = dk_fragment[i_s2, i_k2] * K_shared[i_s2, i_k2] + T.reduce_sum(dkkn_shared, pp_shared, dim=0, clear=True) + for i_k3 in T.Parallel(block_DK): + pp_shared[i_k3] += dgkn_fragment[i_k3] + + for i_s4, i_k4 in T.Parallel(block_S, block_DK): + 
dgk_shared[i_s4, i_k4] = ( + Q_shared[i_s4, i_k4] * dq_fragment[i_s4, i_k4] + - K_shared[i_s4, i_k4] * dk_fragment[i_s4, i_k4] + + T.if_then_else(chunk_last_idx == bs * block_S + i_s4, pp_shared[i_k4], 0.0) + ) + + T.copy(dgk_shared, dg[bb, bs * block_S : (bs + 1) * block_S, bh, bk * block_DK : (bk + 1) * block_DK]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + scale, + input_dtype, + gate_dtype, + qk_dtype, + chunk_size, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=0, +): + q, k, v_new, w, g, h, dv, do, dh = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, gate_dtype)) + + dq_ref, dk_ref, dw_ref, dg_ref = chunk_kda_bwd_dqkwg( + q=q, + k=k, + v=v_new, + w=w, + g=g, + h=h, + dv=dv, + do=do, + dh=dh, + scale=scale, + ) + + dq, dk, dw, dg = prepare_output(B, S, H, DK, DV, chunk_size, getattr(torch, gate_dtype)) + kernel = chunk_bwd_dqkwg( + B=B, S=S, H=H, DK=DK, DV=DV, scale=scale, chunk_size=chunk_size, input_dtype=input_dtype, gate_dtype=gate_dtype + ) + dq, dk, dw, dg = kernel(q, k, v_new, g, h, dv, do, dh) + + compare_tensors("dq", dq_ref, dq) + compare_tensors("dk", dk_ref, dk) + compare_tensors("dw", dw_ref, dw) + compare_tensors("dg", dg_ref, dg) + + fla_time = do_bench( + chunk_kda_bwd_dqkwg, + q=q, + k=k, + v=v_new, + w=w, + g=g, + h=h, + dv=dv, + do=do, + dh=dh, + scale=scale, + ) + tilelang_time = do_bench(kernel, q, k, v_new, g, h, dv, do, dh) + print("fla_time:", fla_time) + print("tilelang_time:", tilelang_time) + + +def main(): + run_test( + B=1, + S=8192, + H=64, + DK=128, + DV=128, + scale=1.0, + input_dtype="float32", + gate_dtype="float32", # gate must be float32 + qk_dtype="float32", + chunk_size=64, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=32, + block_DV=32, + threads=128, + num_stages=2, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_bwd_dv.py b/examples/kda/chunk_bwd_dv.py new file mode 100644 index 0000000000..cdbe0a899c --- /dev/null +++ b/examples/kda/chunk_bwd_dv.py @@ -0,0 +1,150 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune +import sys # noqa: F401 + +from FLA_KDA.fla_chunk_o import chunk_bwd_dv_local +from test_utils_kda import compare_tensors, do_bench + +import torch + +torch.random.manual_seed(1) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + do_dtype, +): + q = torch.randn(B, S, H, DK, dtype=do_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=do_dtype).cuda() + DO = torch.randn(B, S, H, DV, dtype=do_dtype).cuda() + A = torch.randn(B, S, H, chunk_size, dtype=input_dtype).cuda() + return q, k, DO, A + + +def prepare_output( + B, + S, + H, + DV, + chunk_size, + output_dtype, +): + dv = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return dv + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=5) +@tilelang.jit(out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_bwd_kernel_dv_local( + B, + S, + H, + DV, + input_dtype, + output_dtype, + do_dtype, + chunk_size, + block_DV=128, + 
threads=128, + num_stages=1, +): + block_S = BS = chunk_size + DO_shape = (B, S, H, DV) + A_shape = (B, S, H, BS) + + @T.prim_func + def kernel( + DO: T.Tensor(DO_shape, dtype=do_dtype), + A: T.Tensor(A_shape, dtype=input_dtype), + dv: T.Tensor(DO_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh): + bb, bh = bbh // H, bbh % H + + A_shared = T.alloc_shared((BS, BS), dtype=do_dtype) + DO_shared = T.alloc_shared((BS, block_DV), dtype=do_dtype) + dv_fragment = T.alloc_fragment((BS, block_DV), dtype=T.float32) + dv_shared = T.alloc_shared((BS, block_DV), dtype=output_dtype) + + T.copy(A[bb, bs * BS : (bs + 1) * BS, bh, :], A_shared) + for i_s1, i_s2 in T.Parallel(BS, BS): + A_shared[i_s1, i_s2] = T.if_then_else(i_s1 >= i_s2, A_shared[i_s1, i_s2], 0.0) + for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages): + T.copy(DO[bb, bs * BS : (bs + 1) * BS, bh, i_v * block_DV : (i_v + 1) * block_DV], DO_shared) + T.gemm(A_shared, DO_shared, dv_fragment, transpose_A=True, clear_accum=True) # transpose_A: A^T + T.copy(dv_fragment, dv_shared) + T.copy(dv_shared, dv[bb, bs * BS : (bs + 1) * BS, bh, i_v * block_DV : (i_v + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + scale, + input_dtype, + do_dtype, + output_dtype, + chunk_size, +): + q, k, DO, A = prepare_input(B, S, H, DK, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, do_dtype)) + dv_ref = chunk_bwd_dv_local(q, k, do=DO, A=A) + + dv_tilelang = prepare_output(B, S, H, DV, chunk_size, getattr(torch, output_dtype)) + kernel = tilelang_chunk_bwd_kernel_dv_local( + B=B, + S=S, + H=H, + DV=DV, + input_dtype=input_dtype, + output_dtype=output_dtype, + do_dtype=do_dtype, + chunk_size=chunk_size, + ) + dv_tilelang = kernel(DO, A) + compare_tensors("dv", dv_ref, dv_tilelang) + + fla_time = do_bench(chunk_bwd_dv_local, q, k, do=DO, A=A) + tilelang_time = do_bench(kernel, DO, A) + print("fla_time: ", fla_time) + print("tilelang_time: ", tilelang_time) + + +def main(): + run_test( + B=1, + S=1024 * 8, # 32768 + H=64, + DK=128, + DV=128, + scale=1.0, + input_dtype="bfloat16", + do_dtype="float32", + output_dtype="bfloat16", + chunk_size=64, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_bwd_gla_dA.py b/examples/kda/chunk_bwd_gla_dA.py new file mode 100644 index 0000000000..913fa9171f --- /dev/null +++ b/examples/kda/chunk_bwd_gla_dA.py @@ -0,0 +1,147 @@ +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_o import chunk_gla_bwd_dA +from test_utils_kda import compare_tensors, do_bench + +import torch + +torch.random.manual_seed(1) + + +def prepare_input( + B, + S, + H, + DV, + chunk_size, + input_dtype, + do_dtype, +): + DO = torch.randn(B, S, H, DV, dtype=do_dtype).cuda() + V_new = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return DO, V_new + + +def prepare_output( + B, + S, + H, + DV, + chunk_size, + d_type, +): + dA = torch.empty(B, S, H, chunk_size, dtype=d_type).cuda() + return dA + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=5) +@tilelang.jit(out_idx=[-1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def 
tilelang_chunk_bwd_kernel_dA_local(
+    B,
+    S,
+    H,
+    DV,
+    scale,
+    input_dtype,
+    da_dtype,
+    do_dtype,
+    chunk_size,
+    block_DV=128,
+    threads=128,
+    num_stages=1,
+):
+    block_S = BS = chunk_size
+    DO_shape = (B, S, H, DV)
+    V_shape = (B, S, H, DV)
+    dA_shape = (B, S, H, BS)
+
+    @T.prim_func
+    def kernel(
+        DO: T.Tensor(DO_shape, dtype=do_dtype),
+        V: T.Tensor(V_shape, dtype=input_dtype),
+        dA: T.Tensor(dA_shape, dtype=da_dtype),
+    ):
+        with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh):
+            bb, bh = bbh // H, bbh % H
+            do_shared = T.alloc_shared((block_S, block_DV), dtype=do_dtype)
+            V_shared = T.alloc_shared((block_S, block_DV), dtype=do_dtype)
+            dA_fragment = T.alloc_fragment((block_S, block_S), dtype=T.float32)
+
+            T.clear(dA_fragment)
+            for i_v in T.Pipelined(T.ceildiv(DV, block_DV), num_stages=num_stages):
+                T.copy(DO[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], do_shared)
+                T.copy(V[bb, bs * block_S : (bs + 1) * block_S, bh, i_v * block_DV : (i_v + 1) * block_DV], V_shared)
+                T.gemm(do_shared, V_shared, dA_fragment, transpose_B=True)
+            for i_s1, i_s2 in T.Parallel(block_S, block_S):
+                dA_fragment[i_s1, i_s2] = T.if_then_else(i_s1 >= i_s2, dA_fragment[i_s1, i_s2] * scale, 0.0)  # keep only the lower-triangular part
+            T.copy(dA_fragment, dA[bb, bs * block_S : (bs + 1) * block_S, bh, 0:block_S])
+
+    return kernel
+
+
+def run_test(
+    B,
+    S,
+    H,
+    DK,
+    DV,
+    scale,
+    input_dtype,
+    do_dtype,
+    da_dtype,
+    chunk_size,
+):
+    DO, V_new = prepare_input(B, S, H, DV, chunk_size, getattr(torch, input_dtype), getattr(torch, do_dtype))
+    print(DO.dtype, V_new.dtype)
+    dA_ref = chunk_gla_bwd_dA(v=V_new, do=DO, scale=scale)
+
+    dA_tilelang = prepare_output(B, S, H, DV, chunk_size, getattr(torch, da_dtype))
+    kernel = tilelang_chunk_bwd_kernel_dA_local(
+        B=B,
+        S=S,
+        H=H,
+        DV=DV,
+        scale=scale,
+        input_dtype=input_dtype,
+        da_dtype=da_dtype,
+        do_dtype=do_dtype,
+        chunk_size=chunk_size,
+    )
+    dA_tilelang = kernel(DO, V_new)
+    compare_tensors("dA", dA_ref, dA_tilelang)
+    fla_time = do_bench(chunk_gla_bwd_dA, v=V_new, do=DO, scale=scale)
+    tilelang_time = do_bench(kernel, DO, V_new)
+    print("fla_time:", fla_time)
+    print("tilelang_time:", tilelang_time)
+
+
+def main():
+    run_test(
+        B=1,
+        S=1024 * 8,  # 8192
+        H=64,
+        DK=128,
+        DV=128,
+        scale=1.0,
+        input_dtype="bfloat16",
+        do_dtype="bfloat16",
+        da_dtype="float32",
+        chunk_size=64,
+    )
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/kda/chunk_bwd_intra.py b/examples/kda/chunk_bwd_intra.py
new file mode 100644
index 0000000000..a4aa4f9d43
--- /dev/null
+++ b/examples/kda/chunk_bwd_intra.py
@@ -0,0 +1,492 @@
+# Reference: FLA_KDA/fla_chunk_intra.py
+import tilelang
+import tilelang.language as T
+from tilelang.autotuner import autotune
+
+from FLA_KDA.fla_chunk_intra import chunk_kda_bwd_intra
+from FLA_KDA.cumsum import chunk_local_cumsum
+from test_utils_kda import compare_tensors, do_bench
+
+import torch
+
+torch.random.manual_seed(0)
+torch.set_printoptions(profile="full")
+
+
+def prepare_input(
+    B,
+    S,
+    H,
+    DK,
+    chunk_size,
+    input_dtype,
+    output_dtype,
+    accum_dtype,
+    gate_dtype,
+    state_dtype,
+):
+    BT = chunk_size
+    q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda()
+    k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda()
+    g = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda()
+    beta = torch.randn(B, S, H, dtype=input_dtype).cuda()
+
+    # dAqk and dAkk are gradients w.r.t. 
Aqk and Akk + # Shape: (B, S, H, BT) + dAqk = torch.randn(B, S, H, BT, dtype=input_dtype).cuda() + dAkk = torch.randn(B, S, H, BT, dtype=input_dtype).cuda() + + # Initial gradients (will be updated by the kernel) + dq = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + dk = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + db = torch.randn(B, S, H, dtype=input_dtype).cuda() + dg = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + + return q, k, g, beta, dAqk, dAkk, dq, dk, db, dg + + +def prepare_output( + B, + S, + H, + DK, + chunk_size, + NK, + output_dtype, + gate_dtype, + state_dtype, +): + dq = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + dk = torch.empty(B, S, H, DK, dtype=output_dtype).cuda() + db = torch.empty(NK, B, S, H, dtype=output_dtype).cuda() + dg = torch.empty(B, S, H, DK, dtype=gate_dtype).cuda() + return dq, dk, db, dg + + +def get_configs(): + import itertools + + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3] + _configs = list(itertools.product(threads, num_stages)) + + configs = [{"threads": c[0], "num_stages": c[1]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=5, rep=5) +@tilelang.jit( + out_idx=[-4, -3, -2, -1], +) +def tilelang_chunk_bwd_intra( + # task config + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + # kernel config + block_DK, + block_BC=16, + threads=128, + num_stages=0, +): + BT = chunk_size + BC = block_BC # sub-chunk size, typically 16 + + NC = BT // BC # number of sub-chunks + NT = T.ceildiv(S, BT) + NK = T.ceildiv(DK, block_DK) # number of K blocks + + K_shape = (B, S, H, DK) + Beta_shape = (B, S, H) + G_shape = (B, S, H, DK) + BT_shape = (B, S, H, BT) # for dAqk and dAkk + + dq_shape = (B, S, H, DK) + dk_shape = (B, S, H, DK) + db_shape = (B, S, H) + db2_shape = (NK, B, S, H) + dg_shape = (B, S, H, DK) + + @T.prim_func + def kernel( + # input + q: T.Tensor(K_shape, dtype=input_dtype), + k: T.Tensor(K_shape, dtype=input_dtype), + g: T.Tensor(G_shape, dtype=gate_dtype), + beta: T.Tensor(Beta_shape, dtype=input_dtype), + dAqk: T.Tensor(BT_shape, dtype=input_dtype), + dAkk: T.Tensor(BT_shape, dtype=input_dtype), + dq: T.Tensor(dq_shape, dtype=input_dtype), + dk: T.Tensor(dk_shape, dtype=input_dtype), + db: T.Tensor(db_shape, dtype=input_dtype), + dg: T.Tensor(dg_shape, dtype=gate_dtype), + # output + dq2: T.Tensor(dq_shape, dtype=output_dtype), + dk2: T.Tensor(dk_shape, dtype=output_dtype), + db2: T.Tensor(db2_shape, dtype=output_dtype), + dg2: T.Tensor(dg_shape, dtype=gate_dtype), + ): + with T.Kernel(T.ceildiv(DK, block_DK) * NC, NT, B * H, threads=threads) as (i_kc, i_t, i_bh): + i_k, i_i = i_kc // NC, i_kc % NC + bb, bh = i_bh // H, i_bh % H + + # actual sub-chunk index + i_ti = i_t * BT + i_i * BC + + # current sub-chunk data + q_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + k_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + beta_shared = T.alloc_shared((BC,), dtype=input_dtype) + g_current_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + gn_shared = T.alloc_shared((block_DK,), dtype=gate_dtype) # last token's g in current sub-chunk + + dq_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + dk_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + dg_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + + # Allocate fragments + dq2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + dk2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + 
dg2_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + db_fragment = T.alloc_fragment((BC,), dtype=accum_dtype) + + # Initialize fragments + T.clear(dq2_fragment) + T.clear(dk2_fragment) + T.clear(dg2_fragment) + T.clear(db_fragment) + + # Temporary shared memory for previous sub-chunks + k_prev_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + g_prev_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + dAqk_prev_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + dAkk_prev_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + + # Temporary fragment for b_kg computation + kg_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + + kj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + gkj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + kgj_fragment = T.alloc_fragment((BC, block_DK), dtype=T.float32) + dAqk_col = T.alloc_shared((BC,), dtype=input_dtype) + dAkk_col = T.alloc_shared((BC,), dtype=input_dtype) + + # Load g, q, k for current sub-chunk + T.copy(q[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], q_shared) + T.copy(k[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_shared) + T.copy(g[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_current_shared) + T.copy(beta[bb, i_ti : i_ti + BC, bh], beta_shared) + + if i_i > 0: + chunk_first_idx = i_ti # chunk first token idx + + T.copy(g[bb, chunk_first_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gn_shared) # Get the first token's g value (b_gn) + + # Loop over previous sub-chunks (i_j from 0 to i_i-1) + # Since i_i is computed from i_kc % NC and NC is small, we can use conditional blocks + # Process each possible previous sub-chunk with conditional execution + for i_j in T.Pipelined(i_i, num_stages=num_stages): # i_j is index ofprevious sub_chunks + prev_ti = i_t * BT + i_j * BC + T.copy(k[bb, prev_ti : prev_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_prev_shared) + T.copy(g[bb, prev_ti : prev_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_prev_shared) + + T.copy(dAqk[bb, i_ti : i_ti + BC, bh, i_j * BC : (i_j + 1) * BC], dAqk_prev_shared) + T.copy(dAkk[bb, i_ti : i_ti + BC, bh, i_j * BC : (i_j + 1) * BC], dAkk_prev_shared) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + kg_fragment[i_bc, i_k2] = k_prev_shared[i_bc, i_k2] * T.exp2(gn_shared[i_k2] - g_prev_shared[i_bc, i_k2]) + + T.gemm(dAqk_prev_shared, kg_fragment, dq2_fragment, clear_accum=False) + T.gemm(dAkk_prev_shared, kg_fragment, dk2_fragment, clear_accum=False) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + gqn = T.exp2(g_current_shared[i_bc, i_k2] - gn_shared[i_k2]) + dq2_fragment[i_bc, i_k2] = dq2_fragment[i_bc, i_k2] * gqn + dk2_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * gqn + + # Process current sub-chunk diagonal + loop_length = T.min(BC, S - i_t * BT - i_i * BC) + for j in T.Pipelined(loop_length, num_stages=num_stages): + token_j_idx = i_ti + j + + T.copy(k[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], kj_shared) + T.copy(g[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gkj_shared) + T.copy(dAqk[bb, i_ti : i_ti + BC, bh, i_i * BC + j], dAqk_col) + T.copy(dAkk[bb, i_ti : i_ti + BC, bh, i_i * BC + j], dAkk_col) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + kgj_fragment[i_bc, i_k2] = kj_shared[i_k2] * T.exp2(g_current_shared[i_bc, i_k2] - gkj_shared[i_k2]) + dq2_fragment[i_bc, i_k2] += T.if_then_else(i_bc >= j, dAqk_col[i_bc] * kgj_fragment[i_bc, i_k2], 0.0) + dk2_fragment[i_bc, i_k2] += 
T.if_then_else(i_bc >= j, dAkk_col[i_bc] * kgj_fragment[i_bc, i_k2], 0.0) + + # Compute b_db = sum(b_dk2 * b_k, dim=1) + dk2_k_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_k_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * k_shared[i_bc, i_k2] + T.reduce_sum(dk2_k_fragment, db_fragment, dim=1, clear=True) + + # b_dk2 *= b_b[:, None] + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_fragment[i_bc, i_k2] = dk2_fragment[i_bc, i_k2] * beta_shared[i_bc] + + # Compute b_dg2 = b_q * b_dq2 (before adding dq to dq2) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dg2_fragment[i_bc, i_k2] = q_shared[i_bc, i_k2] * dq2_fragment[i_bc, i_k2] + + # Load dq and compute b_dq2 = b_dq2 + b_dq + T.copy(dq[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dq_shared) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dq2_fragment[i_bc, i_k2] = dq2_fragment[i_bc, i_k2] + dq_shared[i_bc, i_k2] + + # # Store results + T.copy(dq2_fragment, dq2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK]) + T.copy(db_fragment, db2[i_k, bb, i_ti : i_ti + BC, bh]) + + # Initialize dkt_fragment for processing subsequent sub-chunks and lower triangular part + dkt_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + T.clear(dkt_fragment) + + # Temporary shared memory for subsequent sub-chunks + q_next_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + k_next_shared = T.alloc_shared((BC, block_DK), dtype=input_dtype) + g_next_shared = T.alloc_shared((BC, block_DK), dtype=gate_dtype) + beta_next_shared = T.alloc_shared((BC,), dtype=input_dtype) + dAqk_next_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + dAkk_next_shared = T.alloc_shared((BC, BC), dtype=input_dtype) + + # Temporary fragments for computation + gkn_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + qg_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + kbg_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + kbg_shared = T.alloc_shared((BC, block_DK), dtype=accum_dtype) + dkt_temp_fragment = T.alloc_fragment((BC, block_DK), dtype=accum_dtype) + # T.use_swizzle(10) + + NC_actual = T.min(NC, T.ceildiv(S - i_t * BT, BC)) # Process subsequent sub-chunks (i_j from i_i+1 to NC-1) + if i_i < NC_actual - 1: + # Get the last token's g value in current sub-chunk + chunk_last_idx = T.min(S, i_ti + BC) - 1 + gn_last_shared = T.alloc_shared((block_DK,), dtype=gate_dtype) + T.copy(g[bb, chunk_last_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gn_last_shared) + + # Loop over subsequent sub-chunks + for i_j in T.Pipelined(i_i + 1, NC_actual, num_stages=num_stages): + i_tj = i_t * BT + i_j * BC + + T.copy(q[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], q_next_shared) + T.copy(k[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], k_next_shared) + T.copy(g[bb, i_tj : i_tj + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], g_next_shared) + T.copy(beta[bb, i_tj : i_tj + BC, bh], beta_next_shared) + + T.copy(dAqk[bb, i_tj : i_tj + BC, bh, i_i * BC : (i_i + 1) * BC], dAqk_next_shared) # [BC, BC] need transpose + T.copy(dAkk[bb, i_tj : i_tj + BC, bh, i_i * BC : (i_i + 1) * BC], dAkk_next_shared) # [BC, BC] need transpose + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + # kbg = k * beta + kbg_fragment[i_bc, i_k2] = k_next_shared[i_bc, i_k2] * beta_next_shared[i_bc] + gkn_shared[i_bc, i_k2] = T.if_then_else( + i_tj + i_bc < S, T.exp2(g_next_shared[i_bc, i_k2] - gn_last_shared[i_k2]), 0.0 + ) + + # 
Compute qg and kbg + for i_bc, i_k2 in T.Parallel(BC, block_DK): + qg_shared[i_bc, i_k2] = q_next_shared[i_bc, i_k2] * gkn_shared[i_bc, i_k2] + kbg_shared[i_bc, i_k2] = kbg_fragment[i_bc, i_k2] * gkn_shared[i_bc, i_k2] + + # Accumulate: dkt += dAqk^T @ qg + dAkk^T @ kbg + # Use transpose_A=True because dAqk/dAkk are loaded in (T, BT) layout but we need (BT, T) for gemm + T.gemm(dAqk_next_shared, qg_shared, dkt_temp_fragment, transpose_A=True, clear_accum=True) + T.gemm(dAkk_next_shared, kbg_shared, dkt_temp_fragment, transpose_A=True, clear_accum=False) + + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dkt_fragment[i_bc, i_k2] = dkt_fragment[i_bc, i_k2] + dkt_temp_fragment[i_bc, i_k2] + + # Scale dkt by exp2(gn_last - g_current) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + g_scale = T.exp2(gn_last_shared[i_k2] - g_current_shared[i_bc, i_k2]) + dkt_fragment[i_bc, i_k2] = dkt_fragment[i_bc, i_k2] * g_scale + + # Process lower triangular part of current sub-chunk diagonal + # This corresponds to j <= i_bc in the diagonal block + qj_shared = T.alloc_shared((block_DK,), dtype=T.float32) + kj_shared_lower = T.alloc_shared((block_DK,), dtype=T.float32) + gj_shared_lower = T.alloc_shared((block_DK,), dtype=T.float32) + bj_local = T.alloc_local((1), dtype=input_dtype) + dAqk_col_lower = T.alloc_shared((BC,), dtype=input_dtype) + dAkk_col_lower = T.alloc_shared((BC,), dtype=input_dtype) + + gkq_fragment = T.alloc_fragment((BC, block_DK), dtype=T.float32) + # dkt_lower_temp = T.alloc_fragment((BC, block_DK), dtype=T.float32) + kbj_fragment = T.alloc_fragment((block_DK,), dtype=T.float32) + + max_token_j_idx = T.min(S, i_ti + BC) + for j in T.Pipelined(BC, num_stages=num_stages): + token_j_idx = i_ti + j + + if token_j_idx < max_token_j_idx: + T.copy(q[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], qj_shared) # [BK] + T.copy(k[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], kj_shared_lower) + T.copy(g[bb, token_j_idx, bh, i_k * block_DK : (i_k + 1) * block_DK], gj_shared_lower) + + bj_local[0] = beta[bb, token_j_idx, bh] + T.copy(dAqk[bb, token_j_idx, bh, i_i * BC : (i_i + 1) * BC], dAqk_col_lower) # [BC] + T.copy(dAkk[bb, token_j_idx, bh, i_i * BC : (i_i + 1) * BC], dAkk_col_lower) + + # Compute kbj = kj * bj + for i_k2 in T.Parallel(block_DK): + kbj_fragment[i_k2] = kj_shared_lower[i_k2] * bj_local[0] + # Compute gkq = exp2(gj - g_current) + for i_bc, i_k2 in T.Parallel(BC, block_DK): + gkq_fragment[i_bc, i_k2] = T.exp2(gj_shared_lower[i_k2] - g_current_shared[i_bc, i_k2]) + + # Accumulate: dkt += (dAkk * kbj + dAqk * qj) * gkq for i_bc <= j + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dkt_fragment[i_bc, i_k2] += T.if_then_else( + i_bc <= j, + (dAkk_col_lower[i_bc] * kbj_fragment[i_k2] + dAqk_col_lower[i_bc] * qj_shared[i_k2]) * gkq_fragment[i_bc, i_k2], + 0.0, + ) + + # Load dk and dg + T.copy(dk[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dk_shared) + T.copy(dg[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], dg_shared) + + # Update dg2: dg2 += (dk2 - dkt) * k + dg + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dg2_fragment[i_bc, i_k2] = ( + dg2_fragment[i_bc, i_k2] + + (dk2_fragment[i_bc, i_k2] - dkt_fragment[i_bc, i_k2]) * k_shared[i_bc, i_k2] + + dg_shared[i_bc, i_k2] + ) + + # Update dk2: dk2 += dk + dkt + for i_bc, i_k2 in T.Parallel(BC, block_DK): + dk2_fragment[i_bc, i_k2] += dk_shared[i_bc, i_k2] + dkt_fragment[i_bc, i_k2] + + # Store dk2 and dg2 + T.copy(dk2_fragment, dk2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : 
(i_k + 1) * block_DK]) + T.copy(dg2_fragment, dg2[bb, i_ti : i_ti + BC, bh, i_k * block_DK : (i_k + 1) * block_DK]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + threads=128, + num_stages=0, + cu_seqlens=None, + chunk_indices=None, +): + q, k, g, beta, dAqk, dAkk, dq, dk, db, dg = prepare_input( + B, + S, + H, + DK, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + + # Reference implementation + dq_ref, dk_ref, db_ref, dg_ref = chunk_kda_bwd_intra( + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dk=dk, + db=db, + dg=dg, + ) + block_DK = min(64, tilelang.math.next_power_of_2(DK)) + NK = (DK + block_DK - 1) // block_DK + # TileLang implementation + kernel = tilelang_chunk_bwd_intra( + B=B, + S=S, + H=H, + DK=DK, + input_dtype=input_dtype, + output_dtype=output_dtype, + accum_dtype=accum_dtype, + gate_dtype=gate_dtype, + state_dtype=state_dtype, + chunk_size=chunk_size, + block_DK=block_DK, + ) + + dq_tilelang, dk_tilelang, db_tilelang, dg_tilelang = prepare_output( + B, S, H, DK, chunk_size, NK, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + dq_tilelang, dk_tilelang, db_tilelang, dg_tilelang = kernel(q, k, g, beta, dAqk, dAkk, dq, dk, db, dg) + db_tilelang = db_tilelang.sum(0).add_(db) + dg_tilelang = chunk_local_cumsum( + dg_tilelang, + chunk_size=chunk_size, + reverse=True, + ) + + compare_tensors("dq", dq_tilelang, dq_ref) + compare_tensors("dk", dk_tilelang, dk_ref) + compare_tensors("db", db_tilelang, db_ref) + compare_tensors("dg", dg_tilelang, dg_ref) + + fla_time = do_bench( + chunk_kda_bwd_intra, + q=q, + k=k, + g=g, + beta=beta, + dAqk=dAqk, + dAkk=dAkk, + dq=dq, + dk=dk, + db=db, + dg=dg, + ) + tilelang_time = do_bench(kernel, q, k, g, beta, dAqk, dAkk, dq, dk, db, dg) + print(f"Fla time: {fla_time}") + print(f"Tilelang time: {tilelang_time}") + + +def main(): + DK = 128 + run_test( + B=1, + S=8192, + H=8, + DK=DK, + input_dtype=T.float32, + output_dtype=T.float32, + accum_dtype=T.float32, + gate_dtype=T.float32, + state_dtype=T.float32, + chunk_size=64, + threads=128, + num_stages=0, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_delta_bwd.py b/examples/kda/chunk_delta_bwd.py new file mode 100644 index 0000000000..8c22488ca4 --- /dev/null +++ b/examples/kda/chunk_delta_bwd.py @@ -0,0 +1,309 @@ +# Reference: fla/ops/common/chunk_delta_h.py +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +from FLA_KDA.fla_chunk_delta import chunk_gated_delta_rule_bwd_dhu +from FLA_KDA.cumsum import chunk_local_cumsum +from test_utils_kda import do_bench, compare_tensors + +import torch +import torch.nn.functional as F + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, +): + Q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() * 0.01 + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + # Note: G should be in logspace and do chunkwise cumsum + G = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + G = chunk_local_cumsum(G, chunk_size) + + h0 = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dht 
= torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + dO = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() * 0.01 + + dv = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + return Q, K, W, G, h0, dht, dO, dv + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + gate_dtype, + state_dtype, +): + BS = S // chunk_size + dh = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + dh0 = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + dv2 = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return dh, dh0, dv2 + + +def get_configs(): + import itertools + + block_DV = [32, 64, 128] + threads = [32, 64, 128, 256] + num_stages = [0, 1, 2, 3, 4] + _configs = list(itertools.product(block_DV, threads, num_stages)) + + configs = [{"block_DV": c[0], "threads": c[1], "num_stages": c[2]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=10, rep=10) +@tilelang.jit(out_idx=[-3, -2, -1]) +def tilelang_chunk_gated_delta_rule_bwd_dhu( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + # kernel config + block_DV=64, + threads=256, + num_stages=0, +): + block_S = chunk_size + # Should support cu_seqlen + BS = S // block_S + + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + W_shape = (B, S, H, DK) + G_shape = (B, S, H, DK) + h0_shape = (B, H, DK, DV) + dht_shape = (B, H, DK, DV) + dO_shape = (B, S, H, DV) + dv_shape = (B, S, H, DV) + + dh_shape = (B, BS, H, DK, DV) + dh0_shape = (B, H, DK, DV) + dv2_shape = (B, S, H, DV) + + @T.prim_func + def kernel( + # Input + Q: T.Tensor(Q_shape, dtype=input_dtype), + K: T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + GK: T.Tensor(G_shape, dtype=gate_dtype), + h0: T.Tensor(h0_shape, dtype=input_dtype), + dht: T.Tensor(dht_shape, dtype=input_dtype), + dO: T.Tensor(dO_shape, dtype=input_dtype), + dv: T.Tensor(dv_shape, dtype=input_dtype), + # Output + dh: T.Tensor(dh_shape, dtype=output_dtype), + dh0: T.Tensor(dh0_shape, dtype=state_dtype), + dv2: T.Tensor(dv2_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_dh_shared = T.alloc_shared((DK, block_DV), dtype=output_dtype) + b_dh_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_1 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + b_dh_fragment_2 = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + dv_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + dv_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dv_fragment_2 = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + dO_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + Q_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + + GK_last_shared = T.alloc_shared((DK,), dtype=gate_dtype) + + if use_final_state_gradient: + T.copy(dht[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_dh_shared) + T.copy(b_dh_shared, b_dh_fragment) + else: + T.clear(b_dh_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # The gradient should be stored in the reverse order + i_s_inv = T.ceildiv(S, block_S) - i_s - 1 # reverse indices + # Store the updated dh + 
T.copy(b_dh_fragment, b_dh_shared) + T.copy(b_dh_shared, dh[bb, i_s_inv, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + # Update dv + T.copy(K[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], K_shared) + T.gemm(K_shared, b_dh_shared, dv_fragment, clear_accum=True) + T.copy( + dv[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dv_shared + ) # copy old dv + T.copy(dv_shared, dv_fragment_2) + for i_s2, i_v in T.Parallel(block_S, block_DV): + dv_fragment[i_s2, i_v] = dv_fragment[i_s2, i_v] + dv_fragment_2[i_s2, i_v] + # Store the updated dv + T.copy(dv_fragment, dv_shared) + T.copy(dv_shared, dv2[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) + + # Update dh + T.copy(Q[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], Q_shared) # [block_S, DK] + T.copy(W[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, 0:DK], W_shared) # [block_S, DK] + T.copy( + dO[bb, i_s_inv * block_S : (i_s_inv + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], dO_shared + ) # [block_S, block_DV] + + if use_gk: + last_idx = T.min((i_s_inv + 1) * block_S, S) - 1 # chunk last token gk + T.copy(GK[bb, last_idx, bh, :], GK_last_shared) + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] *= T.exp2(GK_last_shared[i_k]) + + T.gemm(Q_shared, dO_shared, b_dh_fragment_1, transpose_A=True, clear_accum=True) # [DK, block_DV] + + # dv_shared: [block_S, block_DV] + T.gemm(W_shared, dv_shared, b_dh_fragment_2, transpose_A=True, clear_accum=True) # [DK, block_DV] + for i_k, i_v in T.Parallel(DK, block_DV): + b_dh_fragment[i_k, i_v] += b_dh_fragment_1[i_k, i_v] * scale - b_dh_fragment_2[i_k, i_v] + + if use_initial_state: + T.copy(b_dh_fragment, dh0[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=64, + threads=256, + num_stages=0, + use_torch=False, +): + Q, K, W, G, h0, dht, dO, dv = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + getattr(torch, state_dtype), + ) + + dh_tilelang, dh0_tilelang, dv2_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, gate_dtype), getattr(torch, state_dtype) + ) + + # fla ref + print("fla running...", flush=True) + if use_gk: + dh_ref, dh0_ref, dv2_ref = chunk_gated_delta_rule_bwd_dhu( + q=Q, k=K, w=W, do=dO, dv=dv, gk=G, h0=h0, dht=dht, scale=scale, use_exp2=True + ) + + # tilelang + print("tilelang running...", flush=True) + kernel = tilelang_chunk_gated_delta_rule_bwd_dhu( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + scale, + use_gk, + use_initial_state, + use_final_state_gradient, + ) + dh_tilelang, dh0_tilelang, dv2_tilelang = kernel(Q, K, W, G, h0, dht, dO, dv) + + fla_time = do_bench( + chunk_gated_delta_rule_bwd_dhu, q=Q, k=K, w=W, do=dO, dv=dv, gk=G, h0=h0, dht=dht, scale=scale, chunk_size=chunk_size + ) + tilelang_time = do_bench(kernel, Q, K, W, G, h0, dht, dO, dv) + + print(f"fla time: {fla_time} ms") + print(f"tilelang time: {tilelang_time} ms") + + compare_tensors("dh", dh_ref, dh_tilelang) + compare_tensors("dh0", dh0_ref, dh0_tilelang) + 
compare_tensors("dv2", dv2_ref, dv2_tilelang) + + +def main(): + DK = 128 + run_test( + B=1, + S=1024 * 8, + H=64, + DK=DK, + DV=128, + input_dtype="bfloat16", + output_dtype="bfloat16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + scale=DK**-0.5, + use_gk=True, + use_initial_state=True, + use_final_state_gradient=True, + block_DV=32, + threads=128, + num_stages=1, + use_torch=False, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_delta_h_fwd.py b/examples/kda/chunk_delta_h_fwd.py new file mode 100644 index 0000000000..fbb8bd9882 --- /dev/null +++ b/examples/kda/chunk_delta_h_fwd.py @@ -0,0 +1,306 @@ +# Reference: fla/ops/common/chunk_delta_h.py + +import sys # noqa: F401 +import tilelang +import tilelang.language as T +from tilelang.autotuner import autotune + +# Add your fla repository path to sys.path +# Currently we use the fla repository from the flash-linear-attention project at commit id f03cb3ae +# sys.path.insert(0, "/your/path/to/flash-linear-attention") + +from FLA_KDA.fla_chunk_delta import chunk_gated_delta_rule_fwd_h +from FLA_KDA.cumsum import chunk_local_cumsum + +import torch +import torch.nn.functional as F + +from test_utils_kda import compare_tensors, do_bench + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + K = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + K = F.normalize(K, dim=-1, p=2) + W = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + W = F.normalize(W, dim=-1, p=2) + U = torch.randn(B, S, H, DV, dtype=input_dtype).cuda() + U = F.normalize(U, dim=-1, p=2) + G = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() + G = F.logsigmoid(G) + G = chunk_local_cumsum(G, chunk_size) + initial_state = torch.randn(B, H, DK, DV, dtype=input_dtype).cuda() + return K, W, U, G, initial_state + + +def prepare_output( + B, + S, + H, + DK, + DV, + chunk_size, + output_dtype, + state_dtype, +): + BS = (S + chunk_size - 1) // chunk_size # ceildiv to match kernel iteration + h = torch.empty(B, BS, H, DK, DV, dtype=output_dtype).cuda() + final_state = torch.empty(B, H, DK, DV, dtype=state_dtype).cuda() + V_new = torch.empty(B, S, H, DV, dtype=output_dtype).cuda() + return h, final_state, V_new + + +def get_configs(): + import itertools + + block_DK = [32, 64, 128] + block_DV = [32, 64, 128] + threads = [128, 256] + num_stages = [1, 2, 3] + _configs = list(itertools.product(block_DK, block_DV, threads, num_stages)) + + configs = [{"block_DK": c[0], "block_DV": c[1], "threads": c[2], "num_stages": c[3]} for c in _configs] + return configs + + +@autotune(configs=get_configs(), warmup=3, rep=5) +@tilelang.jit(out_idx=[-3, -2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_gated_delta_rule_fwd_h( + # task config + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk, + use_initial_state, + store_final_state, + save_new_value, + # kernel config + block_DK=64, + block_DV=32, + threads=128, + num_stages=1, +): + block_S = chunk_size + BS = (S + chunk_size - 1) // chunk_size # ceildiv to match kernel iteration + + K_shape = (B, S, H, DK) + V_shape = (B, S, H, DV) + W_shape = (B, S, H, DK) + U_shape = (B, S, H, DV) + GK_shape = (B, S, H, DK) + h_shape = (B, BS, H, DK, DV) + initial_state_shape = (B, H, DK, DV) + final_state_shape = (B, H, DK, DV) + + @T.prim_func + def kernel( + K: 
T.Tensor(K_shape, dtype=input_dtype), + W: T.Tensor(W_shape, dtype=input_dtype), + U: T.Tensor(U_shape, dtype=input_dtype), + GK: T.Tensor(GK_shape, dtype=gate_dtype), + initial_state: T.Tensor(initial_state_shape, dtype=input_dtype), + h: T.Tensor(h_shape, dtype=output_dtype), + final_state: T.Tensor(final_state_shape, dtype=state_dtype), + V_new: T.Tensor(V_shape, dtype=output_dtype), + ): + with T.Kernel(T.ceildiv(DV, block_DV), B * H, threads=threads) as (bv, bbh): + bb, bh = bbh // H, bbh % H + + b_h_shared = T.alloc_shared((DK, block_DV), dtype=input_dtype) + b_h_fragment = T.alloc_fragment((DK, block_DV), dtype=accum_dtype) + + U_shared = T.alloc_shared((block_S, block_DV), dtype=input_dtype) + U_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + W_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + V_new_fragment = T.alloc_fragment((block_S, block_DV), dtype=accum_dtype) + V_new_shared = T.alloc_shared((block_S, block_DV), dtype=output_dtype) + K_shared = T.alloc_shared((block_S, DK), dtype=input_dtype) + GK_last_shared = T.alloc_shared((DK), dtype=gate_dtype) + + if use_initial_state: + T.copy(initial_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV], b_h_shared) + T.copy(b_h_shared, b_h_fragment) + else: + T.clear(b_h_fragment) + + for i_s in T.Pipelined(T.ceildiv(S, block_S), num_stages=num_stages): + # Store previous result to the hidden tensor, like the epilogue + T.copy(b_h_shared, h[bb, i_s, bh, :, bv * block_DV : (bv + 1) * block_DV]) + + # Recurrence + T.copy(W[bb, i_s * block_S : (i_s + 1) * block_S, bh, :], W_shared) + T.gemm(W_shared, b_h_shared, V_new_fragment, clear_accum=True) + + # U - W * S + T.copy(U[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV], U_shared) + T.copy(U_shared, U_fragment) + for i_s2, i_v in T.Parallel(block_S, block_DV): + V_new_fragment[i_s2, i_v] = -V_new_fragment[i_s2, i_v] + U_fragment[i_s2, i_v] + + # Save V_new + if save_new_value: + T.copy(V_new_fragment, dst=V_new_shared) + T.copy(V_new_shared, V_new[bb, i_s * block_S : (i_s + 1) * block_S, bh, bv * block_DV : (bv + 1) * block_DV]) + + T.copy(K[bb, i_s * block_S : (i_s + 1) * block_S, bh, 0:DK], K_shared) + # use_gk + if use_gk: + T.copy(GK[bb, (i_s + 1) * block_S - 1, bh, :], GK_last_shared) # block last token + for i_k, i_v in T.Parallel(DK, block_DV): + b_h_fragment[i_k, i_v] *= T.exp2(GK_last_shared[i_k]) + + # Update intermediate results + T.copy(V_new_fragment, V_new_shared) + T.gemm(K_shared, V_new_shared, b_h_fragment, transpose_A=True) + + T.copy(b_h_fragment, b_h_shared) + + # Save final state + if store_final_state: + T.copy(b_h_fragment, final_state[bb, bh, 0:DK, bv * block_DV : (bv + 1) * block_DV]) + + return kernel + + +def run_test( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=64, + block_DV=32, + threads=128, + num_stages=0, +): + K, W, U, G, initial_state = prepare_input( + B, + S, + H, + DK, + DV, + chunk_size, + getattr(torch, input_dtype), + getattr(torch, output_dtype), + getattr(torch, accum_dtype), + getattr(torch, gate_dtype), + ) + h_ref, final_state_ref, V_new_ref = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) + ) + h_tilelang, final_state_tilelang, V_new_tilelang = prepare_output( + B, S, H, DK, DV, chunk_size, getattr(torch, output_dtype), getattr(torch, state_dtype) 
+ ) + + # fla ref + h_ref, V_new_ref, final_state_ref = chunk_gated_delta_rule_fwd_h( + k=K, + w=W, + u=U, + gk=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + use_exp2=True, + ) + + # tilelang + kernel = tilelang_chunk_gated_delta_rule_fwd_h( + B, + S, + H, + DK, + DV, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + state_dtype, + chunk_size, + use_gk, + use_initial_state, + store_final_state, + save_new_value, + ) + h_tilelang, final_state_tilelang, V_new_tilelang = kernel(K, W, U, G, initial_state) + + fla_time = do_bench( + chunk_gated_delta_rule_fwd_h, + k=K, + w=W, + u=U, + gk=G, + initial_state=initial_state, + output_final_state=store_final_state, + chunk_size=chunk_size, + save_new_value=save_new_value, + use_exp2=True, + ) + tilelang_time = do_bench(kernel, K, W, U, G, initial_state) + + # check correctness + compare_tensors("h", h_ref, h_tilelang) + compare_tensors("final_state", final_state_ref, final_state_tilelang) + compare_tensors("V_new", V_new_ref, V_new_tilelang) + + print(f"tilelang time: {tilelang_time} ms") + print(f"fla time: {fla_time} ms") + + +def main(): + run_test( + B=1, + S=8192, + H=64, + DK=128, + DV=128, + input_dtype="float16", + output_dtype="float16", + accum_dtype="float32", + gate_dtype="float32", + state_dtype="float32", + chunk_size=64, + use_gk=True, + use_initial_state=True, + store_final_state=True, + save_new_value=True, + block_DK=32, + block_DV=32, + threads=128, + num_stages=2, + ) + + +if __name__ == "__main__": + main() diff --git a/examples/kda/chunk_inter_solve_fused.py b/examples/kda/chunk_inter_solve_fused.py new file mode 100644 index 0000000000..940dc20c86 --- /dev/null +++ b/examples/kda/chunk_inter_solve_fused.py @@ -0,0 +1,566 @@ +import tilelang +import tilelang.language as T + +from FLA_KDA.fla_chunk_intra import chunk_kda_fwd_inter_solve_fused +from FLA_KDA.cumsum import chunk_local_cumsum +from test_utils_kda import compare_tensors, do_bench + +import torch +import torch.nn.functional as F + + +torch.random.manual_seed(42) + + +def prepare_input( + B, + S, + H, + DK, + chunk_size, + sub_chunk_size, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, +): + q = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + k = torch.randn(B, S, H, DK, dtype=input_dtype).cuda() + beta = torch.randn(B, S, H, dtype=input_dtype).cuda() + gk = torch.randn(B, S, H, DK, dtype=gate_dtype).cuda() # 需要是cumsum + gk = F.logsigmoid(gk) + gk = chunk_local_cumsum(gk, chunk_size) + + Aqk = torch.empty(B, S, H, chunk_size, dtype=input_dtype).cuda() + Akk_diag = torch.ones(B, S, H, sub_chunk_size, dtype=torch.float32).cuda() + + return q, k, gk, beta, Aqk, Akk_diag + + +def prepare_output( + B, + S, + H, + chunk_size, + sub_chunk_size, + output_dtype, +): + Akk = torch.empty(B, S, H, chunk_size, dtype=output_dtype).cuda() + return Akk + + +@tilelang.jit(out_idx=[-2, -1], pass_configs={tilelang.PassConfigKey.TL_ENABLE_FAST_MATH: True}) +def tilelang_chunk_kda_fwd_inter_fused( + B, + S, + H, + DK, + input_dtype, + output_dtype, + accum_dtype, + gate_dtype, + chunk_size, + sub_chunk_size, + scale, + block_DK=32, + threads=32, + num_stages=1, +): + block_S = BS = chunk_size + BC = sub_chunk_size + Q_shape = (B, S, H, DK) + K_shape = (B, S, H, DK) + GK_shape = (B, S, H, DK) + Beta_shape = (B, S, H) + Aqk_shape = (B, S, H, BS) + Akk_diag_shape = (B, S, H, BC) + """ + Fused kernel: compute inter-subchunk Akk + solve_tril in one pass. 
+    """
+    Fused kernel: compute inter-subchunk Akk + solve_tril in one pass.
+    Prerequisite: token_parallel has already computed diagonal Akk blocks in Akk_diag.
+
+    This kernel:
+    1. Computes off-diagonal Aqk blocks -> writes to global
+    2. Computes off-diagonal Akk blocks -> keeps in registers
+    3. Loads diagonal Akk blocks from Akk_diag (fp32)
+    4. Does forward substitution on diagonals
+    5. Computes merged Akk_inv
+    6. Writes Akk_inv to Akk
+    """
+
+    @T.prim_func
+    def kernel(
+        Q: T.Tensor(Q_shape, dtype=input_dtype),
+        K: T.Tensor(K_shape, dtype=input_dtype),
+        GK: T.Tensor(GK_shape, dtype=gate_dtype),
+        Beta: T.Tensor(Beta_shape, dtype=input_dtype),
+        Akk_diag: T.Tensor(Akk_diag_shape, dtype=T.float32),
+        Aqk: T.Tensor(Aqk_shape, dtype=output_dtype),
+        Akk: T.Tensor(Aqk_shape, dtype=output_dtype),
+    ):
+        with T.Kernel(T.ceildiv(S, block_S), B * H, threads=threads) as (bs, bbh):
+            bb, bh = bbh // H, bbh % H
+
+            Aqk10_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk10_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Aqk20_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk20_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Aqk21_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk21_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Aqk30_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk30_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Aqk31_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk31_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Aqk32_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk32_fragment = T.alloc_fragment((BC, BC), dtype=accum_dtype)
+            Akk10_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Akk20_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Akk21_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Akk30_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Akk31_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Akk32_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+
+            K0_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            GK0_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            Q1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            K1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            GK1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            Q2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            K2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            GK2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            Q3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            K3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            GK3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+
+            Q_GK_scaled_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            K_GK_scaled_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            b_kt_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+
+            b_gn1_shared = T.alloc_shared((block_DK,), dtype=T.float32)
+            b_gn2_shared = T.alloc_shared((block_DK,), dtype=T.float32)
+            b_gn3_shared = T.alloc_shared((block_DK,), dtype=T.float32)
+
+            b_gqn1_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            b_gqn2_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+            b_gqn3_shared = T.alloc_shared((BC, block_DK), dtype=T.float32)
+
+            beta_1_shared = T.alloc_shared((BC,), dtype=T.float32)
+            beta_2_shared = T.alloc_shared((BC,), dtype=T.float32)
+            beta_3_shared = T.alloc_shared((BC,), dtype=T.float32)
+            # Akk_inv
+            Ai_00_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_10_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_11_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_20_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_21_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_22_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_30_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_31_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_32_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            Ai_33_shared = T.alloc_shared((BC, BC), dtype=T.float32)
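+            # The ten Ai_* tiles cover the block lower triangle of the 4x4
+            # sub-chunk grid and will end up holding Akk_inv: diagonal tiles are
+            # loaded from Akk_diag and inverted by forward substitution, then the
+            # off-diagonal Akk tiles computed below are merged in (docstring
+            # steps 3-6).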
+            T.clear(Aqk10_fragment)
+            T.clear(Akk10_fragment)
+            T.clear(Aqk20_fragment)
+            T.clear(Akk20_fragment)
+            T.clear(Aqk21_fragment)
+            T.clear(Akk21_fragment)
+            T.clear(Aqk30_fragment)
+            T.clear(Akk30_fragment)
+            T.clear(Aqk31_fragment)
+            T.clear(Akk31_fragment)
+            T.clear(Aqk32_fragment)
+            T.clear(Akk32_fragment)
+
+            i_tc0 = bs * BS
+            i_tc1 = bs * BS + BC
+            i_tc2 = bs * BS + 2 * BC
+            i_tc3 = bs * BS + 3 * BC
+
+            ################################################################################
+            # 1. off-diagonal blocks
+            ################################################################################
+
+            for i_k in T.Pipelined(T.ceildiv(DK, block_DK), num_stages=num_stages):
+                T.copy(K[bb, bs * BS : bs * BS + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K0_shared)
+                T.copy(GK[bb, bs * BS : bs * BS + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK0_shared)
+                if i_tc1 < S:
+                    T.copy(Q[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q1_shared)
+                    T.copy(K[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K1_shared)
+                    T.copy(GK[bb, i_tc1 : i_tc1 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK1_shared)
+                    T.copy(GK[bb, i_tc1, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn1_shared)  # GK of the first token in this sub-chunk
+                    for i_c1, i_k1 in T.Parallel(BC, block_DK):
+                        b_gqn1_shared[i_c1, i_k1] = T.if_then_else(
+                            i_tc1 + i_c1 < S, T.exp2(GK1_shared[i_c1, i_k1] - b_gn1_shared[i_k1]), 0.0
+                        )
+                        Q_GK_scaled_shared[i_c1, i_k1] = Q1_shared[i_c1, i_k1] * b_gqn1_shared[i_c1, i_k1]
+                        K_GK_scaled_shared[i_c1, i_k1] = K1_shared[i_c1, i_k1] * b_gqn1_shared[i_c1, i_k1]
+                        b_kt_shared[i_c1, i_k1] = K0_shared[i_c1, i_k1] * T.exp2(b_gn1_shared[i_k1] - GK0_shared[i_c1, i_k1])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk10_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk10_fragment, transpose_B=True)
+                if i_tc2 < S:
+                    T.copy(Q[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q2_shared)
+                    T.copy(K[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K2_shared)
+                    T.copy(GK[bb, i_tc2 : i_tc2 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK2_shared)
+                    T.copy(GK[bb, i_tc2, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn2_shared)
+                    for i_c2, i_k2 in T.Parallel(BC, block_DK):
+                        b_gqn2_shared[i_c2, i_k2] = T.if_then_else(
+                            i_tc2 + i_c2 < S, T.exp2(GK2_shared[i_c2, i_k2] - b_gn2_shared[i_k2]), 0.0
+                        )
+                        Q_GK_scaled_shared[i_c2, i_k2] = Q2_shared[i_c2, i_k2] * b_gqn2_shared[i_c2, i_k2]
+                        K_GK_scaled_shared[i_c2, i_k2] = K2_shared[i_c2, i_k2] * b_gqn2_shared[i_c2, i_k2]
+                        b_kt_shared[i_c2, i_k2] = K0_shared[i_c2, i_k2] * T.exp2(b_gn2_shared[i_k2] - GK0_shared[i_c2, i_k2])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk20_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk20_fragment, transpose_B=True)
+                    for i_c3, i_k3 in T.Parallel(BC, block_DK):
+                        b_kt_shared[i_c3, i_k3] = K1_shared[i_c3, i_k3] * T.exp2(b_gn2_shared[i_k3] - GK1_shared[i_c3, i_k3])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk21_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk21_fragment, transpose_B=True)
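+                # Every off-diagonal tile uses the same rescaling trick: the later
+                # sub-chunk is scaled by exp2(gk - gn) and the earlier one by
+                # exp2(gn - gk'), where gn is the gate at the later sub-chunk's
+                # first token, so each gemm accumulates
+                #   sum_k q[i, k] * k[j, k] * exp2(gk[i, k] - gk[j, k])
+                # with gn cancelling between the two factors.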
+                if i_tc3 < S:
+                    T.copy(Q[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], Q3_shared)
+                    T.copy(K[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], K3_shared)
+                    T.copy(GK[bb, i_tc3 : i_tc3 + BC, bh, i_k * block_DK : (i_k + 1) * block_DK], GK3_shared)
+                    T.copy(GK[bb, i_tc3, bh, i_k * block_DK : (i_k + 1) * block_DK], b_gn3_shared)
+                    for i_c4, i_k4 in T.Parallel(BC, block_DK):
+                        b_gqn3_shared[i_c4, i_k4] = T.if_then_else(
+                            i_tc3 + i_c4 < S, T.exp2(GK3_shared[i_c4, i_k4] - b_gn3_shared[i_k4]), 0.0
+                        )
+                        Q_GK_scaled_shared[i_c4, i_k4] = Q3_shared[i_c4, i_k4] * b_gqn3_shared[i_c4, i_k4]
+                        K_GK_scaled_shared[i_c4, i_k4] = K3_shared[i_c4, i_k4] * b_gqn3_shared[i_c4, i_k4]
+                        b_kt_shared[i_c4, i_k4] = K0_shared[i_c4, i_k4] * T.exp2(b_gn3_shared[i_k4] - GK0_shared[i_c4, i_k4])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk30_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk30_fragment, transpose_B=True)
+                    for i_c5, i_k5 in T.Parallel(BC, block_DK):
+                        b_kt_shared[i_c5, i_k5] = K1_shared[i_c5, i_k5] * T.exp2(b_gn3_shared[i_k5] - GK1_shared[i_c5, i_k5])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk31_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk31_fragment, transpose_B=True)
+                    for i_c6, i_k6 in T.Parallel(BC, block_DK):
+                        b_kt_shared[i_c6, i_k6] = K2_shared[i_c6, i_k6] * T.exp2(b_gn3_shared[i_k6] - GK2_shared[i_c6, i_k6])
+                    T.gemm(Q_GK_scaled_shared, b_kt_shared, Aqk32_fragment, transpose_B=True)
+                    T.gemm(K_GK_scaled_shared, b_kt_shared, Akk32_fragment, transpose_B=True)
+
+            ################################################################################
+            # 2. save off-diagonal Aqk blocks and prepare Akk
+            ################################################################################
+
+            if i_tc1 < S:
+                T.copy(Beta[bb, i_tc1 : i_tc1 + BC, bh], beta_1_shared)
+                for i_c21, i_c22 in T.Parallel(BC, BC):
+                    Aqk10_fragment[i_c21, i_c22] = Aqk10_fragment[i_c21, i_c22] * scale
+                    Akk10_fragment[i_c21, i_c22] = Akk10_fragment[i_c21, i_c22] * beta_1_shared[i_c21]
+                T.copy(Aqk10_fragment, Aqk[bb, i_tc1 : i_tc1 + BC, bh, 0:BC])
+                T.copy(Akk10_fragment, Akk10_shared)
+            if i_tc2 < S:
+                T.copy(Beta[bb, i_tc2 : i_tc2 + BC, bh], beta_2_shared)
+                for i_c23, i_c24 in T.Parallel(BC, BC):
+                    Aqk20_fragment[i_c23, i_c24] = Aqk20_fragment[i_c23, i_c24] * scale
+                    Aqk21_fragment[i_c23, i_c24] = Aqk21_fragment[i_c23, i_c24] * scale
+                    Akk20_fragment[i_c23, i_c24] = Akk20_fragment[i_c23, i_c24] * beta_2_shared[i_c23]
+                    Akk21_fragment[i_c23, i_c24] = Akk21_fragment[i_c23, i_c24] * beta_2_shared[i_c23]
+                T.copy(Aqk20_fragment, Aqk[bb, i_tc2 : i_tc2 + BC, bh, 0:BC])
+                T.copy(Aqk21_fragment, Aqk[bb, i_tc2 : i_tc2 + BC, bh, BC : 2 * BC])
+                T.copy(Akk20_fragment, Akk20_shared)
+                T.copy(Akk21_fragment, Akk21_shared)
+            if i_tc3 < S:
+                T.copy(Beta[bb, i_tc3 : i_tc3 + BC, bh], beta_3_shared)
+                for i_c25, i_c26 in T.Parallel(BC, BC):
+                    Aqk30_fragment[i_c25, i_c26] = Aqk30_fragment[i_c25, i_c26] * scale
+                    Aqk31_fragment[i_c25, i_c26] = Aqk31_fragment[i_c25, i_c26] * scale
+                    Aqk32_fragment[i_c25, i_c26] = Aqk32_fragment[i_c25, i_c26] * scale
+                    Akk30_fragment[i_c25, i_c26] = Akk30_fragment[i_c25, i_c26] * beta_3_shared[i_c25]
+                    Akk31_fragment[i_c25, i_c26] = Akk31_fragment[i_c25, i_c26] * beta_3_shared[i_c25]
+                    Akk32_fragment[i_c25, i_c26] = Akk32_fragment[i_c25, i_c26] * beta_3_shared[i_c25]
+                T.copy(Aqk30_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, 0:BC])
+                T.copy(Aqk31_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, BC : 2 * BC])
+                T.copy(Aqk32_fragment, Aqk[bb, i_tc3 : i_tc3 + BC, bh, 2 * BC : 3 * BC])
+                T.copy(Akk30_fragment, Akk30_shared)
+                T.copy(Akk31_fragment, Akk31_shared)
+                T.copy(Akk32_fragment, Akk32_shared)
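+
+            # Akk_diag comes from the earlier token_parallel pass (see the
+            # docstring's prerequisite). Its strictly lower part is negated below
+            # so that forward substitution can invert each unit-lower-triangular
+            # diagonal sub-block in place.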
+
+            ################################################################################
+            # 3. load diagonal Akk blocks
+            ################################################################################
+
+            T.copy(Akk_diag[bb, i_tc0 : i_tc0 + BC, bh, :], Ai_00_shared)
+            T.copy(Akk_diag[bb, i_tc1 : i_tc1 + BC, bh, :], Ai_11_shared)
+            T.copy(Akk_diag[bb, i_tc2 : i_tc2 + BC, bh, :], Ai_22_shared)
+            T.copy(Akk_diag[bb, i_tc3 : i_tc3 + BC, bh, :], Ai_33_shared)
+            for i_c1, i_c2 in T.Parallel(BC, BC):
+                Ai_00_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_00_shared[i_c1, i_c2], 0)
+                Ai_11_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_11_shared[i_c1, i_c2], 0)
+                Ai_22_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_22_shared[i_c1, i_c2], 0)
+                Ai_33_shared[i_c1, i_c2] = T.if_then_else(i_c1 > i_c2, -Ai_33_shared[i_c1, i_c2], 0)
+
+            ################################################################################
+            # 4. forward substitution on diagonals
+            ################################################################################
+            a_00_shared = T.alloc_shared((BC,), dtype=T.float32)
+            Aa_mul_shared = T.alloc_shared((BC, BC), dtype=T.float32)
+            reduce_shared = T.alloc_shared((BC,), dtype=T.float32)
+            for i_i in T.Pipelined(2, T.min(BC, S - i_tc0), num_stages=num_stages):
+                T.copy(Akk_diag[bb, i_tc0 + i_i, bh, :], a_00_shared)  # load row i_i
+                for i_c in T.Parallel(BC):
+                    a_00_shared[i_c] = T.if_then_else(i_c < i_i, -a_00_shared[i_c], 0.0)  # mask: i_c < i_i