silentCoder-dev · AutumnKite · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 31, 2026
diff --git a/examples/auto_schedule/flashmla_benchmark.py b/examples/auto_schedule/flashmla_benchmark.py
@@ -587,7 +587,7 @@ def main(batch=1, heads=64, kv_heads=1, kv_ctx=1024, dim=512, pe_dim=64):
 
     configs = [
         (flashattn_auto, "auto_schedule"),
-        (flashattn_manual, "manual"),
+        # (flashattn_manual, "manual"),  # disabled: manual schedule is not needed
         (flashattn_warp_specialize, "warp_specialize"),
     ]
 

diff --git a/examples/gemm_sm100/gemm_auto_tcgen5mma.py b/examples/gemm_sm100/gemm_auto_tcgen5mma.py
@@ -40,7 +40,7 @@ def main(
             for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                 T.copy(A[by * block_M, k * block_K], A_shared)  # not trans_A
                 T.copy(B[bx * block_N, k * block_K], B_shared)  # trans_B
-                T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, wg_wait=-1, clear_accum=k == 0)
+                T.gemm(A_shared, B_shared, C_tmem, trans_A, trans_B, clear_accum=k == 0)
 
             T.copy(C_tmem, C_local)
             T.copy(C_local, C_shared)

diff --git a/src/transform/auto_schedule.cc b/src/transform/auto_schedule.cc
@@ -39,12 +39,11 @@
 #include <tvm/tir/stmt_functor.h>
 #include <tvm/tir/transform.h>
 
-#include <unordered_set>
-
 #include <algorithm>
 #include <cmath>
 #include <iostream>
 #include <memory>
+#include <numeric>
 #include <optional>
 #include <queue>
 #include <sstream>
@@ -567,8 +566,6 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
     extractor(func->body);
     Stmt body_to_schedule;
     bool has_tilelang_root = false;
-    PrimExpr updated_thread_extent; // Will be set if warpgroup partition
-                                    // doubles thread extent
     IterVar thread_var; // Thread index variable for warpgroup partition
 
     if (extractor.body.defined()) {
@@ -612,24 +609,20 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
 
     // Build ScheduleUnits from IRStructure
     ScheduleUnitBuilder unit_builder;
-    thread_var = ThreadTagChecker::GetThreadVar(body_to_schedule);
-    if (!thread_var.defined()) {
-      thread_var = ThreadTagChecker::GetThreadVar(func->body);
-    }
     if (thread_var.defined()) {
       unit_builder.SetThreadVar(thread_var);
     } else {
       LOG(FATAL) << "Could not find thread index variable, warpgroup "
                     "partition will use default";
     }
-    unit_builder.SetEnableWarpPartition(config.enable_warp_partition);
-    unit_builder.SetSharedMemoryLimit(config.shared_memory_limit);
+    unit_builder.SetWarpSpecializeConfig(config);
+    unit_builder.SetSharedMemoryLimit(GetSharedMemoryLimit(target));
 
-    bool double_thread;
+    std::vector<PrimExpr> thread_count;
     if (!aggressive) {
-      double_thread = unit_builder.NaiveBuild(ir_structure);
+      thread_count = unit_builder.NaiveBuild(ir_structure);
     } else {
-      double_thread = unit_builder.Build(ir_structure);
+      thread_count = unit_builder.Build(ir_structure);
     }
 
     if (!config.enable_warpgroup_partition) {
@@ -656,28 +649,13 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
     int next_barrier_id = 1;
     std::vector<Buffer> barrier_buffers;
     Map<ObjectRef, ObjectRef> barrier_map;
-    // Determine thread count for barrier arrive_count calculations
-    PrimExpr thread_count[2];
-    if (!config.enable_thread_extend) {
-      ICHECK(config.enable_warp_partition);
-      // sm_100: use fixed warp size (32) for both partitions
-      thread_count[0] = IntImm(DataType::Int(32), 32);
-      thread_count[1] = IntImm(DataType::Int(32), 32);
-    } else {
-      // sm_90: original behavior
-      thread_count[0] = thread_var->dom->extent;
-      thread_count[1] = double_thread ? thread_var->dom->extent
-                                      : IntImm(DataType::Int(32),
-                                               config.producer_thread_count);
-    }
     LoopNestingInfo loop_info;
     std::vector<MultiVersionBufferInfo> buffer_infos;
-    PrimExpr barrier_count = config.enable_thread_extend
-                                 ? thread_count[0] + thread_count[1]
-                                 : thread_var->dom->extent;
+    PrimExpr updated_thread_extent = std::accumulate(
+        thread_count.begin() + 1, thread_count.end(), thread_count[0]);
     Buffer neutral_sync_shared_barrier =
-        makeBarrierBuffer(barrier_count, "neutral_sync_shared_barrier", 1,
-                          barrier_buffers, barrier_map);
+        makeBarrierBuffer(updated_thread_extent, "neutral_sync_shared_barrier",
+                          1, barrier_buffers, barrier_map);
     AnalyzeAndInsertBarriers(
         ir_structure.get(), next_barrier_id, barrier_buffers, barrier_map,
         thread_count, loop_info, buffer_infos, neutral_sync_shared_barrier);
@@ -688,19 +666,7 @@ tvm::transform::Pass AutoSchedule(const bool enable_epi) {
     // Apply warpgroup partition to entire IRStructure
     Stmt new_body = ApplyWarpgroupPartitionToIRStructure(
         ir_structure.get(), thread_var, barrier_buffers, barrier_map,
-        enable_epi, thread_count, double_thread, config,
-        neutral_sync_shared_barrier);
-
-    if (config.enable_thread_extend) {
-      // sm_90: may need to update thread extent
-      if (double_thread) {
-        updated_thread_extent = thread_var->dom->extent * 2;
-      } else {
-        updated_thread_extent =
-            thread_var->dom->extent +
-            IntImm(DataType::Int(32), config.producer_thread_count);
-      }
-    }
+        enable_epi, thread_count, config, neutral_sync_shared_barrier);
 
     // If we extracted from tilelang_root block, replace the body
     Stmt final_body;

diff --git a/src/transform/auto_schedule.h b/src/transform/auto_schedule.h
@@ -85,58 +85,25 @@ struct ComponentInfo {
   bool uses_tensor_core_{false};
 };
 
-// Warp specialization architecture enum
-enum class WarpSpecializeArch : uint8_t {
-  kHopper = 0,
-  kBlackwell = 1,
-  kUnsupported = 2,
-};
-
-// Configuration for warp specialization
-struct WarpSpecializeConfig {
-  WarpSpecializeArch arch = WarpSpecializeArch::kUnsupported;
-  int consumer_max_nreg = 0;
-  int producer_max_nreg = 0;
-  int producer_thread_count = 0;
-  bool enable_set_max_nreg = false;
-  bool enable_warpgroup_partition = false;
-  bool enable_thread_extend = false;
-  bool enable_warp_partition = false;
-  int shared_memory_limit = 0;
-};
-
 // Factory function to get warp specialization configuration for a target
 inline WarpSpecializeConfig GetWarpSpecializeConfig(Target target) {
   if (TargetIsHopper(target)) {
-    return {WarpSpecializeArch::kHopper,
-            240,
-            24,
-            128,
-            true,
-            true,
-            true,
-            false,
-            228 * 1024};
+    return {WarpSpecializeArch::kHopper, 240, 24, 128, true, true, true, false};
+  } else if (TargetIsSm100(target)) {
+    return {WarpSpecializeArch::kBlackwell, 0, 0, 32, false, true, false, true};
+  } else {
+    return {
+        WarpSpecializeArch::kUnsupported, 0, 0, 0, false, false, false, false};
+  }
+}
+
+inline int64_t GetSharedMemoryLimit(Target target) {
+  if (TargetIsHopper(target)) {
+    return 228 * 1024;
   } else if (TargetIsSm100(target)) {
-    return {WarpSpecializeArch::kBlackwell,
-            0,
-            0,
-            32,
-            false,
-            true,
-            false,
-            true,
-            228 * 1024};
+    return 228 * 1024;
   } else {
-    return {WarpSpecializeArch::kUnsupported,
-            0,
-            0,
-            0,
-            false,
-            false,
-            false,
-            false,
-            0};
+    return 48 * 1024;
   }
 }