Skip to content
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
Show all changes
191 commits
Select commit Hold shift + click to select a range
1ede0bd
Introduce T.deallocate_tmem and T.transpose (#1971)
LeiWang1999 Mar 30, 2026
a9b8d53
Add `annotations` parameter to `alloc_buffer` in `tilelang/language/a…
Copilot Mar 30, 2026
1c561f6
[Bugfix] Raise error on zero grid dimension instead of silent clamp (…
LeiWang1999 Mar 30, 2026
6a859f1
[BugFix] Fix missing barrier init attrs when TMA is disabled (#1995)
Rachmanino Mar 31, 2026
0f7c214
[BugFix] Add missing fences in GEMM SM100 examples and canonicalize t…
Rachmanino Mar 31, 2026
eb6f05c
[Refactor] Refactor CUDA atomic helpers (#2001)
SiriusNEO Mar 31, 2026
8c3b043
[Bugfix] Fix CuTeDSL autotune cache invalid ELF header (#1967) (#1972)
kurisu6912 Mar 31, 2026
a82fa71
fix: fix copy+cast vectorize loop to use wider vector load/store inst…
Achazwl Mar 31, 2026
e45ecf7
[Feature] Support T.annotate_compile_flags, T.annotate_pass_configs, …
kurisu6912 Apr 2, 2026
6e6295f
[BugFix] Fix CI failures: clean /tmp on self-hosted runners and skip …
kurisu6912 Apr 2, 2026
5f70374
[Test] Add 1D TMA regression test for issue #1842 (#2005)
kurisu6912 Apr 3, 2026
6fc3afa
[BugFix] Fix auto vectorization for binary operations after wider cop…
Achazwl Apr 4, 2026
01c714d
fix: add cudaGetLastError check after cuLaunchKernel in TVM FFI backe…
kurisu6912 Apr 4, 2026
bb79425
[CI] Remove legacy dequantize gemm test (#2013)
LeiWang1999 Apr 5, 2026
1ff58bc
[CI] [pre-commit.ci] autoupdate (#2014)
pre-commit-ci[bot] Apr 6, 2026
4f75940
[BugFix] Enhance CUDA vectorization for binary operations (#2015)
LeiWang1999 Apr 7, 2026
868c740
[Docs] fix arrow direction in ir_transform_diagram.png (#2016)
kermanx Apr 7, 2026
cabdb86
[codex] Fuse packed x2 mul-add into fma2 in CUDA codegen (#2017)
LeiWang1999 Apr 7, 2026
37c1c0c
[codex] Reduce slow pytest runtime in testing/python (#2018)
LeiWang1999 Apr 7, 2026
3ee0988
[Refactor][Pipeline] Run pipeline rewriting before layout inference a…
LeiWang1999 Apr 7, 2026
39adf6a
remove wg_wait in gemm_auto_tcgen5mma.py
AutumnKite Apr 8, 2026
0402120
support more than 2 warp groups
AutumnKite Apr 8, 2026
b021c2a
change `before` and `after` to map
AutumnKite Apr 8, 2026
1cb8fde
Bump transformers from 4.53.0 to 5.0.0rc3 in /examples/bitnet-1.58b (…
dependabot[bot] Apr 8, 2026
469a847
pin apache-tvm-ffi<0.1.10 (derived_object regression) (#2020)
oraluben Apr 8, 2026
86e37b7
Fix serial loop phase dtype mismatch in LowerTileOp (#2022)
LeiWang1999 Apr 8, 2026
9a15696
Fix typo
silentCoder-dev Apr 8, 2026
7668827
Merge branch 'auto-schedule' of https://github.com/silentCoder-dev/ti…
AutumnKite Apr 8, 2026
d37fd42
implement a naive ws on F3
AutumnKite Apr 9, 2026
f75ceac
Merge branch 'auto-schedule' of https://github.com/silentCoder-dev/ti…
AutumnKite Apr 9, 2026
9806233
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
AutumnKite Apr 9, 2026
5c7352e
fix typo and run format
AutumnKite Apr 9, 2026
1381a30
Merge branch 'auto-schedule' of https://github.com/silentCoder-dev/ti…
AutumnKite Apr 9, 2026
3b47b87
modify return value of NaiveBuild
AutumnKite Apr 9, 2026
313ed08
Merge branch 'auto-schedule' of https://github.com/silentCoder-dev/ti…
AutumnKite Apr 9, 2026
101a672
fix barrier logic
AutumnKite Apr 10, 2026
35f13d2
run format
AutumnKite Apr 10, 2026
a5e3f19
Re-enable deprecated `TL_DISABLE_TMA_LOWER` pass config for TMA store…
LJC00118 Apr 10, 2026
b1a88bf
[Misc] Remove mistakenly introduced temp file (#2027)
SiriusNEO Apr 10, 2026
9b4e0a2
Fix warpgroup partition
Denverjin Apr 10, 2026
4db4433
Merge pull request #1 from Denverjin/auto-schedule-myh
AutumnKite Apr 10, 2026
b31aaa9
fix barrier logic
AutumnKite Apr 10, 2026
7be6332
Merge branch 'auto-schedule' of https://github.com/silentCoder-dev/ti…
AutumnKite Apr 10, 2026
853e805
[Codegen] Add lexical_alloc_scope for scoped local variable lifetime …
LeiWang1999 Apr 11, 2026
90299d6
[Bugfix] Fix incorrect sync hoist for fragment buffer conditions in T…
LeiWang1999 Apr 11, 2026
d619164
add .agents/skills/build/SKILL.md for build conventions (#2019)
oraluben Apr 11, 2026
7a515b5
[AMD][gfx950] Add gfx950 support for DeepGeem example (#2028)
zhangnju Apr 12, 2026
11dc3e6
Fix shared memory buffer reuse
Denverjin Apr 13, 2026
5d729ee
[Refactor] Remove GEMM v1 and promote gemm_py to be the canonical gem…
LeiWang1999 Apr 13, 2026
35d8139
[CI]: Bump actions/github-script from 8 to 9 (#2036)
dependabot[bot] Apr 13, 2026
8243f7e
Nan propagation option for bf16 and half16 (#1958)
haoran35-jpg Apr 13, 2026
fc5001f
FIx naive loop var duplication bug
Denverjin Apr 13, 2026
19236b4
[Feature] Add TIR builtins for warp-level vote and block-level predic…
sepcnt Apr 13, 2026
b3d5981
[API] Default warp-lane mask to 0xFFFFFFFF for warp-sync builtins (#2…
LeiWang1999 Apr 13, 2026
a8bafa6
fix: suppress false positive conflict write warning when dst index de…
kurisu6912 Apr 14, 2026
74fc980
[Refactor] Refactor `DecoupleTypeCast` Pass (#2026)
LJC00118 Apr 14, 2026
f309d81
[Bugfix][Subtype] Fix scalar fp4 store/load codegen for non-packed bu…
kurisu6912 Apr 14, 2026
380fb5e
Support local var fragment
Denverjin Apr 15, 2026
39c3c06
Merge pull request #2 from Denverjin/auto-schedule-myh
AutumnKite Apr 15, 2026
a16ff86
[Feature] autodd: add __freeze__ annotation to protect code regions f…
kurisu6912 Apr 15, 2026
d2e02e1
[BugFix] Skip MMA shared buffer layout inference when layout already …
kurisu6912 Apr 15, 2026
c93778f
support tcgen05_gemm
AutumnKite Apr 15, 2026
7893678
[Refactor] Remove obsolete RewriteWgmmaSync pass (#2046)
LeiWang1999 Apr 15, 2026
e3d214d
Add if node
Denverjin Apr 15, 2026
011d880
Merge pull request #3 from Denverjin/auto-schedule-myh
AutumnKite Apr 15, 2026
3c7f4a0
remove debug info
AutumnKite Apr 15, 2026
db3d859
run format
AutumnKite Apr 15, 2026
f12fb47
[Refactor] Move target gating into InjectFenceProxy pass entry (#2047)
LeiWang1999 Apr 15, 2026
7ae8d98
fix loop break detection
Denverjin Apr 15, 2026
dbadd77
fix control node break & format
Denverjin Apr 15, 2026
510d773
Merge pull request #4 from Denverjin/auto-schedule-myh
AutumnKite Apr 15, 2026
7ed8266
Fix if task collection bug
Denverjin Apr 15, 2026
235ad7e
Add regression test for 1D TMA load compilation and execution (#1989)
huyhoang171106 Apr 15, 2026
9e78312
Merge pull request #5 from Denverjin/auto-schedule-myh
AutumnKite Apr 16, 2026
891109e
[Transform] Add InjectTcgen05Fence pass (#2003)
LeiWang1999 Apr 16, 2026
8f67446
fix loop break
AutumnKite Apr 16, 2026
844d04e
fix tma load detection
Denverjin Apr 16, 2026
45f5da4
fix pro/epilogue identification
Denverjin Apr 16, 2026
9970ac9
remove debug output & format
Denverjin Apr 16, 2026
1660c85
reimplement barrier logic
AutumnKite Apr 16, 2026
25b0eb8
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 16, 2026
37f05ee
partly fix pro/epilogue logic for barrier
Denverjin Apr 16, 2026
e84cee9
Merge pull request #6 from Denverjin/auto-schedule-myh
AutumnKite Apr 16, 2026
a4aa5f3
refactor ir structure clone
Denverjin Apr 16, 2026
33e5c36
fix bug
Denverjin Apr 16, 2026
cee084c
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 16, 2026
36f51af
Merge pull request #7 from Denverjin/auto-schedule-myh
AutumnKite Apr 16, 2026
7d1e6e3
fix bug
AutumnKite Apr 16, 2026
d098f51
[Enhancement] Use atomic directory rename for cache writes (#1982)
LeiWang1999 Apr 16, 2026
a70fa26
refactor shared memory buffer merge
Denverjin Apr 17, 2026
561adfe
Merge pull request #8 from Denverjin/auto-schedule-myh
AutumnKite Apr 17, 2026
189c99f
upload latency & ii
Denverjin Apr 17, 2026
68f4710
Merge pull request #9 from Denverjin/auto-schedule-myh
AutumnKite Apr 17, 2026
cb00a60
Replace syntactic loop-var checks with invariance checks (#2050)
LJC00118 Apr 17, 2026
27f1f81
[Feature][Example] Introduce CLC tile schedule and add example for sm…
Rachmanino Apr 17, 2026
77cbe6d
[Feature] Introduce T.CUDASourceCodeKernel (#1970)
SiriusNEO Apr 17, 2026
aa0700b
run format
AutumnKite Apr 17, 2026
72d7748
fix II of IfNode
AutumnKite Apr 17, 2026
4bf8678
fix barrier
AutumnKite Apr 17, 2026
9c5fe44
[BugFix] Keep shared-prelude local vars in producer-consumer WS (#2055)
Rachmanino Apr 17, 2026
04468a3
[Bugfix] Fix stage-expanded annotated-layout aliases in LayoutInferen…
TerminusAkivili Apr 17, 2026
6364f5d
fix pro/epilogue let stmt copy
Denverjin Apr 17, 2026
557bcc1
fix z3 small n error
Denverjin Apr 17, 2026
3bc1c01
fix double kernel issue
Denverjin Apr 17, 2026
105b73b
format
Denverjin Apr 17, 2026
b18c60a
undo failed merge
Denverjin Apr 17, 2026
6ca1415
Merge pull request #10 from Denverjin/auto-schedule-myh
AutumnKite Apr 17, 2026
aa877ab
check dependency in prologue
AutumnKite Apr 17, 2026
b2abf0a
Merge commit '891109ea879839bbfe2ad937db208f2d7c6f1ce0' into auto-sch…
Denverjin Apr 17, 2026
c96dd9e
fix header missing
Denverjin Apr 17, 2026
e1d6388
[Cache] Refactor cache namespace layout (#2057)
LeiWang1999 Apr 18, 2026
0924dab
[Bugfix] Use shared::cta instead of shared::cluster for non-cluster T…
qqq-tao Apr 18, 2026
b13cdf3
change the interface to support tasks with wg_id=-1
AutumnKite Apr 20, 2026
acadde0
remove unused declarations
AutumnKite Apr 20, 2026
f8e7059
fix: improve warning output in eager frontend (#2064)
kurisu6912 Apr 20, 2026
36e0015
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 20, 2026
f7c6f43
fix read/write regions
AutumnKite Apr 20, 2026
b10e6fb
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 20, 2026
99ee74b
[CUDA] Support int4 `T.gemm` (#2063)
LeiWang1999 Apr 20, 2026
c797e41
[Bugfix] Correct index calculation in Software Pipeline pass (#2070)
Rachmanino Apr 20, 2026
96c649f
Add frontmatter for the build skill (#2068)
VitalyAnkh Apr 21, 2026
b744da1
Refactor ptx_ldmatrix to use tl.access_ptr with simplified signature …
LeiWang1999 Apr 21, 2026
046b1bd
[FFI] Remove upper version bound on apache-tvm-ffi (#2071)
LeiWang1999 Apr 21, 2026
9c95a42
[Refactor] Phaseout legacy util `map_torch_type` with `T.dtype.as_tor…
LeiWang1999 Apr 21, 2026
64bd742
[Bugfix] Fix reduce layout (#2074)
bucket-xv Apr 21, 2026
948d38a
[Refactor] Disable unhelpful warning print (#2077)
LeiWang1999 Apr 21, 2026
15309f5
[CUDA] Improve int4 GEMM lowering and packed codegen support (#2073)
LeiWang1999 Apr 21, 2026
4e7d126
Bump pytest --numprocesses from 4 to 8 across all platforms (#2076)
LeiWang1999 Apr 21, 2026
38cac96
fix dead-lock bug
AutumnKite Apr 22, 2026
4b3127a
[Enhancement] Enhance alloc_var function to handle _ptr_sentinel dtyp…
LeiWang1999 Apr 22, 2026
6fee850
[Release] Bump version into 0.1.9 (#2060)
LeiWang1999 Apr 22, 2026
1f831f3
fix register fragment reuse
Denverjin Apr 22, 2026
cac434c
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 22, 2026
3c3cf53
Merge pull request #11 from Denverjin/auto-schedule-myh
AutumnKite Apr 22, 2026
441c3b0
[Refactor] Strip build machine paths from LOG messages in wheel relea…
LeiWang1999 Apr 22, 2026
a640a89
[AMD][Radeon] Add the Support of RDNA3/RDNA3.5(gfx11) WMMA (#2044)
jiawei-real Apr 22, 2026
3aeb962
fix let stmt clone bug
Denverjin Apr 22, 2026
4eed399
add the innermost task to sync infos
AutumnKite Apr 22, 2026
55cf9c1
[codex] Remove dead transform pass leftovers (#2083)
LeiWang1999 Apr 22, 2026
9aba41f
[Bugfix] Enable `.shared::cta` in TMA copy paths only on CUDA 12.8+ (…
ColmaLiu Apr 22, 2026
b6e75b1
fix hopper neutral stage
Denverjin Apr 23, 2026
6e2fb56
change layout map & remove unused letstmt
Denverjin Apr 23, 2026
89e6812
disable auto scheduling when using thread vars
Denverjin Apr 23, 2026
be0df30
Merge pull request #12 from Denverjin/auto-schedule-myh
AutumnKite Apr 23, 2026
f3f6e74
find first/last tasks of a buffer access and reduce syncs by checking…
AutumnKite Apr 23, 2026
59c9b05
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
AutumnKite Apr 23, 2026
bdb20c2
format
AutumnKite Apr 23, 2026
ff067b0
fix attr warp partition
Denverjin Apr 23, 2026
b88bbcb
[AMD][gfx950] Add ds_read_tr16_b64 / ds_read_tr8_b64 support for gfx9…
zhangnju Apr 23, 2026
95f1d29
fix let missing bug
Denverjin Apr 23, 2026
65738cd
Merge pull request #13 from Denverjin/auto-schedule-myh
AutumnKite Apr 23, 2026
6d0bffb
[AMD][Gfx950] Add the support of 160K LDS and copy.async (#2058)
zhangnju Apr 23, 2026
10b7f1f
add double-thread constraint
AutumnKite Apr 23, 2026
4cec4ba
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
AutumnKite Apr 23, 2026
ef6a431
fix local var copy
Denverjin Apr 23, 2026
0f29f9c
[BugFix] Relax loop wait and adjust trailing drain behavior in async …
Rachmanino Apr 23, 2026
e99d35a
fix naive ir structure bug
Denverjin Apr 24, 2026
2ca5e06
Merge pull request #14 from Denverjin/auto-schedule-myh
AutumnKite Apr 24, 2026
86989d7
format
AutumnKite Apr 24, 2026
a552916
fix: remove unused let
AutumnKite Apr 24, 2026
c18c623
remove redundant letstmts
Denverjin Apr 24, 2026
55cbe0e
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 24, 2026
0edb76c
move the rewrites forward
AutumnKite Apr 24, 2026
09d9071
add WAW dependence & avoid duplicated dependence when iter=1
AutumnKite Apr 24, 2026
9897fe5
fix barrier around let missing
Denverjin Apr 24, 2026
3c5578d
Merge branch 'auto-schedule' of https://github.com/AutumnKite/tilelan…
Denverjin Apr 24, 2026
b46d709
Merge pull request #15 from Denverjin/auto-schedule-myh
AutumnKite Apr 24, 2026
01bf798
remove cross-warpgroup dependency for register buffers
AutumnKite Apr 24, 2026
d255c0a
fix reused buffer analysis
Denverjin Apr 24, 2026
3df8b46
check kernel using barrier & format
Denverjin Apr 24, 2026
468d51b
Merge pull request #16 from Denverjin/auto-schedule-myh
AutumnKite Apr 24, 2026
e18b8e6
format
AutumnKite Apr 24, 2026
264efe2
[Feature] Block-scaled GEMM support for MXFP8 on Blackwell (#1945)
Rachmanino Apr 24, 2026
057e5ba
[Host CodeGen][Refactor] Cleanup namespace and remove useless C templ…
SiriusNEO Apr 25, 2026
3f16e50
Add opt-out for prelower semantic checks for DeepSeek V4 Flash on ARM…
foraxe Apr 25, 2026
0ee6345
[Example] Add HISA: hierarchical sparse attention indexer (#2069)
xuyufei-a Apr 25, 2026
8f4a08f
[Language] Small cleanup and notes for alloc global (#2100)
SiriusNEO Apr 25, 2026
8e12157
[Enhancement] Optimize hopper fp8 deepgemm tile size (#2103)
Rachmanino Apr 26, 2026
ffdf514
[CUDA][SM100] Include cuda_fp6.h when emitting FP6 types (#2102)
TerminusAkivili Apr 26, 2026
6a29c76
feat: support cdna4 v_mfma_i32_16x16x64_i8 & v_mfma_i32_32x32x32_i8 (…
Paran0idy Apr 26, 2026
53a4c98
[AMD] [gfx950]Fix multiple HIP codegen bugs to support TileKernel (…
zhangnju Apr 26, 2026
73a54d2
Merge remote-tracking branch 'tilelang/main' into auto-schedule-myh
Denverjin Apr 27, 2026
a2f6a45
fix T.make_tensor buffer missing
Denverjin Apr 27, 2026
b60e8ce
Merge pull request #17 from Denverjin/auto-schedule
AutumnKite May 6, 2026
3490f3a
add constraints for warpgroup partition & always analyze and insert b…
AutumnKite May 6, 2026
2903cd1
assign tma store to consumer side
AutumnKite May 6, 2026
28aa5ad
Fix let & barrier bugs
Denverjin May 7, 2026
b29d135
Merge pull request #18 from Denverjin/auto-schedule
AutumnKite May 7, 2026
10503ba
add top-level barriers
AutumnKite May 7, 2026
f62744a
format
AutumnKite May 7, 2026
4f8823b
fix empty for bug
AutumnKite May 8, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion examples/auto_schedule/flashmla_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,7 +587,7 @@ def main(batch=1, heads=64, kv_heads=1, kv_ctx=1024, dim=512, pe_dim=64):

configs = [
(flashattn_auto, "auto_schedule"),
(flashattn_manual, "manual"),
# (flashattn_manual, "manual"), # manual schedule is not needed
(flashattn_warp_specialize, "warp_specialize"),
]

Expand Down
2 changes: 1 addition & 1 deletion examples/gemm_sm100/gemm_auto_tcgen5mma.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def main(
for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
T.copy(A[by * block_M, k * block_K], A_shared) # not trans_A
T.copy(B[bx * block_N, k * block_K], B_shared) # trans_B
T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, wg_wait=-1, clear_accum=k == 0)
T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, clear_accum=k == 0)

T.copy(C_tmem, C_local)
T.copy(C_local, C_shared)
Expand Down
56 changes: 11 additions & 45 deletions src/transform/auto_schedule.cc
Original file line number Diff line number Diff line change
Expand Up @@ -39,12 +39,11 @@
#include <tvm/tir/stmt_functor.h>
#include <tvm/tir/transform.h>

#include <unordered_set>

#include <algorithm>
#include <cmath>
#include <iostream>
#include <memory>
#include <numeric>
#include <optional>
#include <queue>
#include <sstream>
Expand Down Expand Up @@ -567,8 +566,6 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
extractor(func->body);
Stmt body_to_schedule;
bool has_tilelang_root = false;
PrimExpr updated_thread_extent; // Will be set if warpgroup partition
// doubles thread extent
IterVar thread_var; // Thread index variable for warpgroup partition

if (extractor.body.defined()) {
Expand Down Expand Up @@ -612,24 +609,20 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {

// Build ScheduleUnits from IRStructure
ScheduleUnitBuilder unit_builder;
thread_var = ThreadTagChecker::GetThreadVar(body_to_schedule);
if (!thread_var.defined()) {
thread_var = ThreadTagChecker::GetThreadVar(func->body);
}
if (thread_var.defined()) {
unit_builder.SetThreadVar(thread_var);
} else {
LOG(FATAL) << "Could not find thread index variable, warpgroup "
"partition will use default";
}
unit_builder.SetEnableWarpPartition(config.enable_warp_partition);
unit_builder.SetSharedMemoryLimit(config.shared_memory_limit);
unit_builder.SetWarpSpecializeConfig(config);
unit_builder.SetSharedMemoryLimit(GetSharedMemoryLimit(target));

bool double_thread;
std::vector<PrimExpr> thread_count;
if (!aggressive) {
double_thread = unit_builder.NaiveBuild(ir_structure);
thread_count = unit_builder.NaiveBuild(ir_structure);
} else {
double_thread = unit_builder.Build(ir_structure);
thread_count = unit_builder.Build(ir_structure);
}

if (!config.enable_warpgroup_partition) {
Expand All @@ -656,28 +649,13 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
int next_barrier_id = 1;
std::vector<Buffer> barrier_buffers;
Map<ObjectRef, ObjectRef> barrier_map;
// Determine thread count for barrier arrive_count calculations
PrimExpr thread_count[2];
if (!config.enable_thread_extend) {
ICHECK(config.enable_warp_partition);
// sm_100: use fixed warp size (32) for both partitions
thread_count[0] = IntImm(DataType::Int(32), 32);
thread_count[1] = IntImm(DataType::Int(32), 32);
} else {
// sm_90: original behavior
thread_count[0] = thread_var->dom->extent;
thread_count[1] = double_thread ? thread_var->dom->extent
: IntImm(DataType::Int(32),
config.producer_thread_count);
}
LoopNestingInfo loop_info;
std::vector<MultiVersionBufferInfo> buffer_infos;
PrimExpr barrier_count = config.enable_thread_extend
? thread_count[0] + thread_count[1]
: thread_var->dom->extent;
PrimExpr updated_thread_extent = std::accumulate(
thread_count.begin() + 1, thread_count.end(), thread_count[0]);
Buffer neutral_sync_shared_barrier =
makeBarrierBuffer(barrier_count, "neutral_sync_shared_barrier", 1,
barrier_buffers, barrier_map);
makeBarrierBuffer(updated_thread_extent, "neutral_sync_shared_barrier",
1, barrier_buffers, barrier_map);
AnalyzeAndInsertBarriers(
ir_structure.get(), next_barrier_id, barrier_buffers, barrier_map,
thread_count, loop_info, buffer_infos, neutral_sync_shared_barrier);
Expand All @@ -688,19 +666,7 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
// Apply warpgroup partition to entire IRStructure
Stmt new_body = ApplyWarpgroupPartitionToIRStructure(
ir_structure.get(), thread_var, barrier_buffers, barrier_map,
enable_epi, thread_count, double_thread, config,
neutral_sync_shared_barrier);

if (config.enable_thread_extend) {
// sm_90: may need to update thread extent
if (double_thread) {
updated_thread_extent = thread_var->dom->extent * 2;
} else {
updated_thread_extent =
thread_var->dom->extent +
IntImm(DataType::Int(32), config.producer_thread_count);
}
}
enable_epi, thread_count, config, neutral_sync_shared_barrier);

// If we extracted from tilelang_root block, replace the body
Stmt final_body;
Expand Down
61 changes: 14 additions & 47 deletions src/transform/auto_schedule.h
Original file line number Diff line number Diff line change
Expand Up @@ -85,58 +85,25 @@ struct ComponentInfo {
bool uses_tensor_core_{false};
};

// Warp specialization architecture enum
enum class WarpSpecializeArch : uint8_t {
kHopper = 0,
kBlackwell = 1,
kUnsupported = 2,
};

// Configuration for warp specialization
struct WarpSpecializeConfig {
WarpSpecializeArch arch = WarpSpecializeArch::kUnsupported;
int consumer_max_nreg = 0;
int producer_max_nreg = 0;
int producer_thread_count = 0;
bool enable_set_max_nreg = false;
bool enable_warpgroup_partition = false;
bool enable_thread_extend = false;
bool enable_warp_partition = false;
int shared_memory_limit = 0;
};

// Factory function to get warp specialization configuration for a target
inline WarpSpecializeConfig GetWarpSpecializeConfig(Target target) {
if (TargetIsHopper(target)) {
return {WarpSpecializeArch::kHopper,
240,
24,
128,
true,
true,
true,
false,
228 * 1024};
return {WarpSpecializeArch::kHopper, 240, 24, 128, true, true, true, false};
} else if (TargetIsSm100(target)) {
return {WarpSpecializeArch::kBlackwell, 0, 0, 32, false, true, false, true};
} else {
return {
WarpSpecializeArch::kUnsupported, 0, 0, 0, false, false, false, false};
}
}

inline int64_t GetSharedMemoryLimit(Target target) {
if (TargetIsHopper(target)) {
return 228 * 1024;
} else if (TargetIsSm100(target)) {
return {WarpSpecializeArch::kBlackwell,
0,
0,
32,
false,
true,
false,
true,
228 * 1024};
return 228 * 1024;
} else {
return {WarpSpecializeArch::kUnsupported,
0,
0,
0,
false,
false,
false,
false,
0};
return 48 * 1024;
}
}

Expand Down
Loading