tile-ai · LeiWang1999 · May 8, 2026 · May 7, 2026 · May 8, 2026 · May 8, 2026
diff --git a/benchmark/matmul/benchmark_matmul_intrinsic.py b/benchmark/matmul/benchmark_matmul_intrinsic.py
diff --git a/docs/deeplearning_operators/matmul.md b/docs/deeplearning_operators/matmul.md
@@ -62,7 +62,7 @@ Below is a simplified code snippet for a 1024 x 1024 x 1024 matrix multiplicatio
 ```python
 import tilelang
 import tilelang.language as T
-from tilelang.intrinsics import make_mma_swizzle_layout
+from tilelang.cuda.intrinsics import make_mma_swizzle_layout
 
 def matmul(M, N, K, block_M, block_N, block_K, dtype="float16", accum_dtype="float"):
     @T.prim_func

diff --git a/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py b/examples/bitnet-1.58b/kernel_benchmark/tilelang_bitnet_158_int8xint2_prefill.py
@@ -7,12 +7,12 @@
 import tilelang.language as T
 from tilelang import tvm as tvm
 from tvm import DataType
-from tilelang.intrinsics.mma_layout import (
+from tilelang.cuda.intrinsics.layout.mma_layout import (
     make_mma_swizzle_layout as make_swizzle_layout,
 )
 import numpy as np
 
-from tilelang.intrinsics.mma_macro_generator import (
+from tilelang.cuda.intrinsics.macro.mma_macro_generator import (
     INT4TensorCoreIntrinEmitter,
 )
 from tilelang.transform import simplify_prim_func

diff --git a/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py b/examples/dequantize_gemm/example_dequant_gemm_fine_grained.py
@@ -141,8 +141,8 @@ def tl_matmul_with_ladder_weight_only_transform_block_reduce_int4(
     accum_dtype,
     transform_b,
 ):
-    from tilelang.intrinsics.mma_layout import make_mma_swizzle_layout as make_swizzle_layout
-    from tilelang.intrinsics.mma_macro_generator import (
+    from tilelang.cuda.intrinsics.layout.mma_layout import make_mma_swizzle_layout as make_swizzle_layout
+    from tilelang.cuda.intrinsics.macro.mma_macro_generator import (
         TensorCoreIntrinEmitterWithLadderTransform,
     )
 

diff --git a/examples/gemm/README.md b/examples/gemm/README.md
@@ -174,7 +174,7 @@ Below is a more advanced snippet that showcases how to apply memory layouts, ena
 import tilelang.language as T
 # `make_mma_swizzle_layout` is a python-defined layout function
 # that helps align data for MMA (Matrix Multiply-Accumulate) operations.
-from tilelang.intrinsics import make_mma_swizzle_layout as make_swizzle_layout
+from tilelang.cuda.intrinsics import make_mma_swizzle_layout as make_swizzle_layout
 
 def matmul(M, N, K, block_M, block_N, block_K, dtype=T.float16, accum_dtype=T.float):
     @T.prim_func

diff --git a/examples/gemm/example_gemm_intrinsics.py b/examples/gemm/example_gemm_intrinsics.py
@@ -2,8 +2,8 @@
 from tvm import DataType
 import tilelang
 import tilelang.language as T
-from tilelang.intrinsics import get_swizzle_layout
-from tilelang.intrinsics.mma_macro_generator import (
+from tilelang.cuda.intrinsics import get_swizzle_layout
+from tilelang.cuda.intrinsics.macro.mma_macro_generator import (
     TensorCoreIntrinEmitter,
 )
 

diff --git a/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py b/examples/gemm_fp8/example_tilelang_gemm_amd_fp8_preshuffle.py
@@ -6,7 +6,7 @@
 import tilelang.language as T
 from tilelang.tileop.base import GemmWarpPolicy
 from tilelang.layout import make_swizzled_layout
-from tilelang.intrinsics.mfma_macro_generator import MatrixCorePreshuffleIntrinEmitter
+from tilelang.rocm.intrinsics.mfma_macro_generator import MatrixCorePreshuffleIntrinEmitter
 from tilelang.utils import determine_fp8_type
 
 tilelang.testing.set_random_seed(0)