
Commit 5d64dfb

update tile scheduler & add runtime check
1 parent 252b0d1 commit 5d64dfb

3 files changed: 30 additions, 61 deletions

applications/flash_attention_v2/kernel/xe_fhma_fwd_kernel.hpp

Lines changed: 11 additions & 34 deletions
@@ -349,6 +349,14 @@ class XeFMHAFwdDynamicSplitKernel {
   }

   static bool can_implement(Arguments const &args) {
+    // current kernel only supports decode
+    if (args.kernel.shape.seq_len_qo > 1) {
+      return false;
+    }
+    // current kernel only supports num batch heads no larger than the total XeCore count
+    if (args.kernel.shape.batch * args.kernel.shape.num_heads_q > args.hw_info.sm_count) {
+      return false;
+    }
     return CollectiveMainloop::can_implement(args.mainloop)
         && CollectiveEpilogue::can_implement(args.epilogue);
   }
@@ -436,8 +444,6 @@ class XeFMHAFwdDynamicSplitKernel {
       out1(i) = out1(i) * broadcast<0>(rescale1, out1, i) + out2(i) * broadcast<0>(rescale2, out2, i);
     }

-#define DEBUG_PRINT 0
-
   CUTLASS_DEVICE
   void operator()(Params const &params, char *smem_buf)
   {
@@ -456,25 +462,19 @@ class XeFMHAFwdDynamicSplitKernel {
     int tid_in_sg = thr_id % intel::sg_size;
     int num_batch_heads = s.batch * s.num_heads_q;

-    TileScheduler tile_scheduler{params.scheduler};
-
     int local_k_blocks = cute::ceil_div(s.seq_len_kv, get<1>(TileShapeQK{}));
     // total number of blocks need to be processed across all wgs
     int total_k_blocks = local_k_blocks * num_batch_heads;
     // to guarantee all wg process similar number of blocks of KV
     int num_blocks_per_wg = cute::ceil_div(total_k_blocks, GridDimZ());

-#if DEBUG_PRINT
-    if (thr_id == 0 && wg_id == 0) {
-      cute::print("Debug>> total_k_blocks: %d, num_blocks_per_wg: %d, local_k_blocks: %d, num_batch_heads: %d\n",
-                  total_k_blocks, num_blocks_per_wg, local_k_blocks, num_batch_heads);
-    }
-#endif
+    TileScheduler tile_scheduler{params.scheduler, get<1>(TileShapeQK{}), local_k_blocks, num_batch_heads};

     CUTLASS_PRAGMA_NO_UNROLL
     for (; tile_scheduler.is_valid(); ++tile_scheduler) {
       // head_q, idx_b from tile scheduler will not be used
-      auto [blk_q, blk_v, head_q_unused, idx_b_unused] = tile_scheduler.get_block_coord(); // (Q,V,h,b)
+      // auto [blk_q, blk_v, head_q_unused, idx_b_unused] = tile_scheduler.get_block_coord(); // (Q,V,h,b)
+      auto [blk_q, blk_v, start_batch_head_id] = tile_scheduler.get_block_coord(); // (Q,V, batch_head_idx)
       auto blk_qv = make_coord(blk_q, blk_v);

       auto shape_Q = make_shape(s.seq_len_qo, s.head_size_qk, s.num_heads_q, s.batch);
@@ -495,23 +495,13 @@ class XeFMHAFwdDynamicSplitKernel {
       FragA tArA;
       FragARow tA_max, tA_sum;

-      // compute start/end batch head id for current wg
-      int start_batch_head_id = wg_id * num_blocks_per_wg / local_k_blocks;
-
       // compute num computed blocks for start batch head id
       int num_computed_blocks = (start_batch_head_id == 0) ? (wg_id * num_blocks_per_wg) : (wg_id * num_blocks_per_wg - start_batch_head_id * local_k_blocks);
       int start_blk, end_blk, head_q, idx_b, head_kv;
       // leader wg is also responsible for reducing partial results, while other
       // worker wg only to compute partial results
       bool is_leader_wg = wg_id < num_batch_heads;

-#if DEBUG_PRINT
-      if (thr_id == 0) {
-        cute::print("Debug>> wg id %d, start_batch_head_id: %d, num_computed_blocks: %d\n",
-                    wg_id, start_batch_head_id, num_computed_blocks);
-      }
-#endif
-
       if (thr_id == 0 && is_leader_wg) {
         // reset atomic counter before computation
         *(params.atomic_reduce_cnt_ptr + wg_id) = 0;
@@ -558,13 +548,6 @@ class XeFMHAFwdDynamicSplitKernel {
       // partition id of start batch head id in current wg
       int partition_id = get_partition_id(wg_id, batch_head_id, num_blocks_per_wg, local_k_blocks);

-#if DEBUG_PRINT
-      if (thr_id == 0) {
-        cute::print("Debug>> wg id %d, batch_head_id: %d, partition_id: %d\n",
-                    wg_id, batch_head_id, partition_id);
-      }
-#endif
-
       // store partial result: tArA, tA_max and tA_sum
       int offset = batch_head_id * max_num_partitions * num_elem_per_thead * SGPerWG::value * intel::sg_size
                  + partition_id * num_elem_per_thead * SGPerWG::value * intel::sg_size
@@ -601,12 +584,6 @@ class XeFMHAFwdDynamicSplitKernel {
       if (is_leader_wg) {
         int num_partitions = get_num_partitions(wg_id, num_blocks_per_wg, local_k_blocks);

-#if DEBUG_PRINT
-        if (thr_id == 0) {
-          cute::print("Debug>> wg id %d, num_partitions: %d\n", wg_id, num_partitions);
-        }
-#endif
-
         // check atomic to wait for partial results ready
         while(atomicLoad(params.atomic_reduce_cnt_ptr + wg_id) != num_partitions) {}

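The two can_implement() checks added above replace the asserts that this commit removes from the scheduler's to_underlying_arguments() (see the next file). Below is a minimal standalone sketch of the same guard logic; the shape values and the dynamic_split_supported helper are made up for illustration and are not part of the commit.

// Standalone sketch of the new runtime checks, with assumed shape values;
// it mirrors the can_implement() logic above but is not code from the commit.
#include <cstdio>

struct Shape { int seq_len_qo, batch, num_heads_q; };

bool dynamic_split_supported(Shape s, int xecore_count) {
  if (s.seq_len_qo > 1) return false;                       // decode only
  if (s.batch * s.num_heads_q > xecore_count) return false; // one leader wg per batch-head
  return true;
}

int main() {
  Shape decode{/*seq_len_qo=*/1, /*batch=*/1, /*num_heads_q=*/32};
  std::printf("decode, 32 heads, 64 XeCores -> %s\n",
              dynamic_split_supported(decode, 64) ? "supported" : "not supported");
  Shape prefill{/*seq_len_qo=*/1024, /*batch=*/1, /*num_heads_q=*/32};
  std::printf("prefill -> %s\n",
              dynamic_split_supported(prefill, 64) ? "supported" : "not supported");
}

A caller would query can_implement(args) before launch and pick a different kernel when it returns false, instead of hitting a host-side assert.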
applications/flash_attention_v2/kernel/xe_tile_scheduler.hpp

Lines changed: 18 additions & 27 deletions
@@ -101,9 +101,15 @@ struct XeFHMAIndividualPersistentTileScheduler {
 
   bool valid_ = true;
   Params params;
+  int kv_tile_size_;
+  // num of kv blocks for each head
+  int local_num_kv_blocks_;
+  int num_batch_heads_;

   CUTLASS_DEVICE
-  XeFHMAIndividualPersistentTileScheduler(Params const& params) : params(params) {}
+  XeFHMAIndividualPersistentTileScheduler(Params const& params, int kv_tile_size,
+                                          int local_num_kv_blocks, int num_batch_heads)
+    : params(params), kv_tile_size_(kv_tile_size), local_num_kv_blocks_(local_num_kv_blocks), num_batch_heads_(num_batch_heads) {}

   template <class ProblemShape, class TileShape>
   static Params to_underlying_arguments(
@@ -116,31 +122,8 @@ struct XeFHMAIndividualPersistentTileScheduler {
                      size(ceil_div(shape.seq_len_qo, get<0>(tile_shape))), // Q
                      size(shape.batch * shape.num_heads_q)); // (h,b) -- split later
     int num_heads = shape.num_heads_q;
-
-    auto total_wg = grid.x * grid.y * grid.z;
-    // FIXME: replace with runtime check
-    assert(shape.batch == 1);
-    assert((grid.z <= hw_info.sm_count / 2) && "XeFHMAIndividualPersistentTileScheduler only enabled for decode case where num batch heads samller than SM count");
-
-    // how many partitions each KV seq is split into
-    int num_partitions = hw_info.sm_count / grid.z;
-    // this is for the case where sm_count cannot be divisible by num_batch_heads,
-    // for some head/work group, the KV seq need to split into `num_partitions+1`
-    // partitions to occupy all xecores, here we assme first `tail_wg` work groups
-    // will handle one more partition
-    // for eample, num head is 8, sm_count is 20, so first 20%8=4 work groups
-    // will handle 3 partitions, the rest 4 work groups will handle 2 partitions
-    int num_tail_wg = hw_info.sm_count % grid.z;
-
-    // assume grid shape (1, 1, hw_info.sm_count) to use all xecores
     grid.z = hw_info.sm_count;
-    // int num_partitions = 4; // for 5/1
-    // grid.z *= num_partitions;
-    // num_heads *= num_partitions;
-
-    // FIXME: add fallback mechanism if given problem size doesn't meet requirement

-    std::cout << "Debug>> grid shape [" << grid.x << ", " << grid.y << ", " << grid.z << "]\n";
     return Params{grid, {num_heads}};
   }

@@ -157,10 +140,18 @@ struct XeFHMAIndividualPersistentTileScheduler {
   CUTLASS_DEVICE
   auto get_block_coord() {
     using namespace cute;
-    int idx_b = BlockIdxZ();
+    int wg_id = BlockIdxZ();
     int head;
-    params.divmod_num_heads(idx_b, head, idx_b);
-    return make_coord(BlockIdxY(), BlockIdxX(), head, idx_b);
+
+    // total number of blocks need to be processed across all wgs
+    int total_num_kv_blocks = local_num_kv_blocks_ * num_batch_heads_;
+    // guarantee all wg process similar number of blocks of KV (load balance)
+    int num_blocks_per_wg = cute::ceil_div(total_num_kv_blocks, GridDimZ());
+
+    // compute start batch head id for current wg
+    int start_batch_head_id = wg_id * num_blocks_per_wg / local_num_kv_blocks_;
+
+    return make_coord(BlockIdxY(), BlockIdxX(), start_batch_head_id);
   }

   CUTLASS_DEVICE
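The updated get_block_coord() maps each persistent work-group to a starting batch-head by splitting local_num_kv_blocks_ * num_batch_heads_ KV blocks evenly across GridDimZ() work-groups; the kernel uses the same num_blocks_per_wg value to derive num_computed_blocks and partition ids. Below is a standalone sketch of that arithmetic with assumed sizes (8 batch-heads, 24 KV blocks per head, 20 XeCores); none of these numbers come from the commit.

// Sketch only: reproduces the wg -> (start head, block offset) mapping used by
// the updated scheduler and kernel; the shape numbers below are assumptions.
#include <algorithm>
#include <cstdio>

int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  int num_batch_heads = 8;   // batch * num_heads_q
  int local_k_blocks  = 24;  // ceil_div(seq_len_kv, KV tile size)
  int grid_z          = 20;  // hw_info.sm_count, one persistent wg per XeCore

  int total_k_blocks    = local_k_blocks * num_batch_heads;  // 192
  int num_blocks_per_wg = ceil_div(total_k_blocks, grid_z);  // 10

  for (int wg_id = 0; wg_id < grid_z; ++wg_id) {
    // Same formula as XeFHMAIndividualPersistentTileScheduler::get_block_coord()
    int start_batch_head_id = wg_id * num_blocks_per_wg / local_k_blocks;
    // Blocks of that head already covered by lower-numbered work-groups
    // (matches num_computed_blocks in the kernel)
    int num_computed_blocks = wg_id * num_blocks_per_wg - start_batch_head_id * local_k_blocks;
    int blocks_for_this_wg  = std::max(0, std::min(num_blocks_per_wg,
                                                   total_k_blocks - wg_id * num_blocks_per_wg));
    std::printf("wg %2d: start head %d, offset %2d, processes %2d KV block(s)\n",
                wg_id, start_batch_head_id, num_computed_blocks, blocks_for_this_wg);
  }
  return 0;
}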

examples/06_bmg_flash_attention/06_xe_fmha_fwd.cpp

Lines changed: 1 addition & 0 deletions
@@ -113,6 +113,7 @@ int main(int argc, const char **argv) {
 #define KV_TILE_SIZE _256
 #else
 #define NUM_SG _16
+#define KV_TILE_SIZE _512
 #endif

 #if HEAD_DIM == 16
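The branch that sets NUM_SG to _16 now also defines KV_TILE_SIZE as _512. Assuming this macro feeds get<1>(TileShapeQK{}), it determines the per-head KV block count the kernel hands to the scheduler; a quick illustration with an assumed sequence length (only the 512 comes from the change above):

// Illustration only: seq_len_kv = 8192 is an assumed value, not from the commit.
#include <cstdio>
int main() {
  int seq_len_kv     = 8192;                                            // assumed
  int kv_tile_size   = 512;                                             // KV_TILE_SIZE for NUM_SG == _16
  int local_k_blocks = (seq_len_kv + kv_tile_size - 1) / kv_tile_size;  // ceil_div -> 16 blocks per head
  std::printf("KV blocks per head: %d\n", local_k_blocks);
}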
