Skip to content

Commit dad2193

Browse files
committed
Eliminate all H2D & D2H copies
1 parent 5b6f880 commit dad2193

File tree

3 files changed

+49
-122
lines changed

3 files changed

+49
-122
lines changed

examples/11_bmg_moe_gemm_bf16/11_bmg_moe_gemm_bf16.cpp

Lines changed: 45 additions & 118 deletions
Original file line numberDiff line numberDiff line change
@@ -280,8 +280,8 @@ template <class Gemm> struct ExampleRunner {
280280
return passed;
281281
}
282282

283-
/// Allocates device-side data
284-
void allocate(const GroupGEMMOptions &options, const ElementA *block_A_ptr,
283+
/// Allocates device-side data for reference GEMM
284+
void allocate_for_ref_gemm(const GroupGEMMOptions &options, const ElementA *block_A_ptr,
285285
const ElementA *block_B_ptr, ElementOutput*block_C_ptr,
286286
int block_A_size, int block_B_size, int block_C_size) {
287287
int64_t total_elements_A = 0;
@@ -338,85 +338,10 @@ template <class Gemm> struct ExampleRunner {
338338
cumsum_device.copy_from_host(cumsum_host);
339339
}
340340

341-
/// Initialize operands to be used in the GEMM and reference GEMM
342-
void initialize_for_moe_gemm(const GroupGEMMOptions &options) {
341+
/// Initialize operands to be used in the reference GEMM
342+
void initialize(const GroupGEMMOptions &options) {
343343

344-
problem_sizes.reset(options.groups);
345-
problem_sizes.copy_from_host(options.problem_sizes_host.data());
346-
347-
//
348-
// Assign pointers
349-
//
350-
351-
std::vector<ElementA *> ptr_A_host(1);
352-
std::vector<ElementB *> ptr_B_host(1);
353-
std::vector<ElementC *> ptr_C_host(1);
354-
std::vector<ElementOutput *> ptr_D_host(1);
355-
std::vector<ElementAccumulator *> ptr_alpha_host(options.groups);
356-
std::vector<ElementAccumulator *> ptr_beta_host(options.groups);
357-
358-
// Compute offsets, alpha & beta over group on host
359-
360-
ptr_A_host.at(0) = block_A.get();
361-
ptr_B_host.at(0) = block_B.get();
362-
ptr_C_host.at(0) = block_C.get();
363-
ptr_D_host.at(0) = block_D.get();
364-
for (int32_t i = 0; i < options.groups; ++i) {
365-
// Fill host vector of alpha & beta with random values if using per-group
366-
// values
367-
alpha_host.push_back(
368-
(options.alpha == FLT_MAX)
369-
? static_cast<ElementAccumulator>((rand() % 5) + 1)
370-
: options.alpha);
371-
beta_host.push_back((options.beta == FLT_MAX)
372-
? static_cast<ElementAccumulator>(rand() % 5)
373-
: options.beta);
374-
// Fill host ptr vectors with offset addresses into device alpha/beta
375-
// blocks
376-
ptr_alpha_host.at(i) = block_alpha.get() + i;
377-
ptr_beta_host.at(i) = block_beta.get() + i;
378-
}
379-
380-
// Allocate device memory & copy from host
381-
ptr_A.reset(1);
382-
// Per-group alpha and beta
383-
ptr_A.copy_from_host(ptr_A_host.data());
384-
385-
ptr_B.reset(1);
386-
ptr_B.copy_from_host(ptr_B_host.data());
387-
388-
ptr_C.reset(1);
389-
ptr_C.copy_from_host(ptr_C_host.data());
390-
391-
ptr_D.reset(1);
392-
ptr_D.copy_from_host(ptr_D_host.data());
393-
394-
stride_A.reset(options.groups);
395-
stride_A.copy_from_host(stride_A_host.data());
396-
397-
stride_B.reset(options.groups);
398-
stride_B.copy_from_host(stride_B_host.data());
399-
400-
stride_C.reset(options.groups);
401-
stride_C.copy_from_host(stride_C_host.data());
402-
403-
stride_D.reset(options.groups);
404-
stride_D.copy_from_host(stride_D_host.data());
405-
406-
// Per-group alpha and beta ptrs
407-
alpha_device.reset(options.groups);
408-
alpha_device.copy_from_host(ptr_alpha_host.data());
409-
beta_device.reset(options.groups);
410-
beta_device.copy_from_host(ptr_beta_host.data());
411-
412-
// Per-group alpha and beta values - note these are not directly passed to
413-
// kernel - the pointers (alpha_device/beta_device) are passed instead
414-
block_alpha.copy_from_host(alpha_host.data());
415-
block_beta.copy_from_host(beta_host.data());
416-
}
417-
418-
/// Initialize operands to be used in the GEMM and reference GEMM
419-
void initialize_for_ref_gemm(const GroupGEMMOptions &options) {
344+
uint64_t seed = 2020;
420345

421346
problem_sizes.reset(options.groups);
422347
problem_sizes.copy_from_host(options.problem_sizes_host.data());
@@ -438,8 +363,10 @@ template <class Gemm> struct ExampleRunner {
438363
ptr_B_host.at(i) = block_B.get() + offset_B.at(i);
439364
ptr_C_host.at(i) = block_C.get() + offset_C.at(i);
440365
ptr_D_host.at(i) = block_D.get() + offset_D.at(i);
441-
// Fill host ptr vectors with offset addresses into device alpha/beta
442-
// blocks
366+
// Fill host vector of alpha & beta with random values if using per-group values
367+
alpha_host.push_back((options.alpha == FLT_MAX) ? static_cast<ElementAccumulator>((rand() % 5) + 1) : options.alpha);
368+
beta_host.push_back((options.beta == FLT_MAX) ? static_cast<ElementAccumulator>(rand() % 5) : options.beta);
369+
// Fill host ptr vectors with offset addresses into device alpha/beta blocks
443370
ptr_alpha_host.at(i) = block_alpha.get() + i;
444371
ptr_beta_host.at(i) = block_beta.get() + i;
445372
}
@@ -475,9 +402,8 @@ template <class Gemm> struct ExampleRunner {
475402
alpha_device.copy_from_host(ptr_alpha_host.data());
476403
beta_device.reset(options.groups);
477404
beta_device.copy_from_host(ptr_beta_host.data());
478-
479-
// Per-group alpha and beta values - note these are not directly passed to
480-
// kernel - the pointers (alpha_device/beta_device) are passed instead
405+
// Per-group alpha and beta values - note these are not directly passed to kernel - the pointers
406+
// (alpha_device/beta_device) are passed instead
481407
block_alpha.copy_from_host(alpha_host.data());
482408
block_beta.copy_from_host(beta_host.data());
483409
}
@@ -486,6 +412,9 @@ template <class Gemm> struct ExampleRunner {
486412
typename Gemm::Arguments
487413
args_from_options(const GroupGEMMOptions &options,
488414
const cutlass::KernelHardwareInfo &hw_info,
415+
const ElementA* A_ptr,
416+
const ElementB* B_ptr,
417+
ElementOutput* D_ptr,
489418
const int gemm_N,
490419
const int gemm_K) {
491420
typename Gemm::Arguments arguments;
@@ -494,8 +423,8 @@ template <class Gemm> struct ExampleRunner {
494423
if (options.alpha != FLT_MAX && options.beta != FLT_MAX) {
495424
// If both alpha/beta are provided (via cmd line args) and are scalar,
496425
// i.e., same alpha/beta applies to all batches.
497-
fusion_args.alpha = options.alpha;
498-
fusion_args.beta = options.beta;
426+
fusion_args.alpha = 1;
427+
fusion_args.beta = 0;
499428
fusion_args.alpha_ptr = nullptr;
500429
fusion_args.beta_ptr = nullptr;
501430
fusion_args.alpha_ptr_array = nullptr;
@@ -506,12 +435,12 @@ template <class Gemm> struct ExampleRunner {
506435
} else {
507436
// If pointers to alpha/beta are provided, i.e., alpha/beta can differ
508437
// between batches/groups.
509-
fusion_args.alpha = 0;
438+
fusion_args.alpha = 1;
510439
fusion_args.beta = 0;
511440
fusion_args.alpha_ptr = nullptr;
512441
fusion_args.beta_ptr = nullptr;
513-
fusion_args.alpha_ptr_array = alpha_device.get();
514-
fusion_args.beta_ptr_array = beta_device.get();
442+
fusion_args.alpha_ptr_array = nullptr;
443+
fusion_args.beta_ptr_array = nullptr;
515444
// One alpha and beta per each group
516445
fusion_args.dAlpha = {cute::_0{}, cute::_0{}, 1};
517446
fusion_args.dBeta = {cute::_0{}, cute::_0{}, 1};
@@ -523,11 +452,11 @@ template <class Gemm> struct ExampleRunner {
523452
// Per-GEMM problem shape info may only exist on the device.
524453
if (host_problem_shapes_available) {
525454
arguments = typename Gemm::Arguments{
526-
cutlass::gemm::GemmUniversalMode::kGrouped,
527-
ptr_A.get(),
528-
ptr_B.get(),
529-
nullptr,
530-
ptr_D.get(),
455+
cutlass::gemm::GemmUniversalMode::kGrouped, // this just means grouped GEMM
456+
static_cast<const ElementA**>((void*)A_ptr),
457+
static_cast<const ElementB**>((void*)B_ptr),
458+
static_cast<const ElementC**>((void*)D_ptr), // we could also pass nullptr
459+
static_cast<ElementOutput**>((void*)D_ptr),
531460
fusion_args,
532461
hw_info,
533462
{1, RasterOrderOptions::AlongN},
@@ -538,10 +467,10 @@ template <class Gemm> struct ExampleRunner {
538467
} else {
539468
arguments = typename Gemm::Arguments{
540469
cutlass::gemm::GemmUniversalMode::kGrouped,
541-
ptr_A.get(),
542-
ptr_B.get(),
543-
nullptr,
544-
ptr_D.get(),
470+
static_cast<const ElementA**>((void*)A_ptr),
471+
static_cast<const ElementB**>((void*)B_ptr),
472+
static_cast<const ElementC**>((void*)D_ptr),
473+
static_cast<ElementOutput**>((void*)D_ptr),
545474
fusion_args,
546475
hw_info,
547476
{1, RasterOrderOptions::AlongN},
@@ -557,12 +486,11 @@ template <class Gemm> struct ExampleRunner {
557486
cutlass::Status run(const GroupGEMMOptions &options,
558487
const cutlass::KernelHardwareInfo &hw_info,
559488
const ElementA *A_ptr, const ElementB *B_ptr,
560-
ElementOutput *C_ptr, int A_size, int B_size, int D_size, const int gemm_n, const int gemm_k) {
561-
allocate(options, A_ptr, B_ptr, C_ptr, A_size, B_size, D_size);
562-
initialize_for_moe_gemm(options);
489+
ElementOutput *D_ptr, int A_size, int B_size, int D_size, const int gemm_n, const int gemm_k) {
490+
allocate_for_ref_gemm(options, A_ptr, B_ptr, D_ptr, A_size, B_size, D_size);
563491

564492
Gemm gemm_op;
565-
auto arguments = args_from_options(options, hw_info, gemm_n, gemm_k);
493+
auto arguments = args_from_options(options, hw_info, A_ptr, B_ptr, D_ptr, gemm_n, gemm_k);
566494

567495
size_t workspace_size = Gemm::get_workspace_size(arguments);
568496
cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
@@ -575,15 +503,14 @@ template <class Gemm> struct ExampleRunner {
575503
CUTLASS_CHECK(gemm_op.run());
576504

577505
syclcompat::wait();
578-
initialize_for_ref_gemm(options);
506+
initialize(options);
579507
// Verify that the result is correct
580508
bool passed = verify(options);
581509
std::cout << "Disposition: " << (passed ? "Passed" : "Failed") << std::endl;
582510
if (!passed)
583511
return cutlass::Status::kErrorInternal;
584-
initialize_for_moe_gemm(options);
585512
syclcompat::wait();
586-
arguments = args_from_options(options, hw_info, gemm_n, gemm_k);
513+
arguments = args_from_options(options, hw_info, A_ptr, B_ptr, D_ptr, gemm_n, gemm_k);
587514
CUTLASS_CHECK(gemm_op.can_implement(arguments));
588515

589516
CUTLASS_CHECK(gemm_op.initialize(arguments, workspace.get()));
@@ -647,20 +574,15 @@ void MoEGEMM(const bfloat16_t *activations, const bfloat16_t *weights,
647574
hw_info.device_id);
648575

649576
using LayoutA = cutlass::layout::RowMajor;
650-
using LayoutB = cutlass::layout::ColumnMajor;
577+
using LayoutB = cutlass::layout::RowMajor;
651578
using LayoutC = cutlass::layout::RowMajor;
652579
using LayoutD = cutlass::layout::RowMajor;
653580

654581
using GmemTiledCopyA = XE_2D_U16x32x32_LD_N;
655-
using GmemTiledCopyB = XE_2D_U16x16x16_LD_T;
582+
using GmemTiledCopyB = XE_2D_U16x32x32_LD_V;
656583

657584
// Workgroup-level tile
658585
using TileShape = Shape<_256, _256, _32>;
659-
/*
660-
using TiledMma =
661-
TiledMMA<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>,
662-
Layout<Shape<_8, _4, _1>, Stride<_4, _1, _0>>>;
663-
*/
664586

665587
using TiledMma = // M=8,N=16,K=16, D=f32,A=bf16,B=bf16,C=f32
666588
typename TiledMMAHelper<MMA_Atom<XE_8x16x16_F32BF16BF16F32_TT>, Layout<TileShape>,
@@ -710,18 +632,23 @@ void MoEGEMM(const bfloat16_t *activations, const bfloat16_t *weights,
710632

711633

712634
int main(int argc, const char **argv) {
713-
const int num_experts = 32;
635+
const int num_experts = 16;
714636

715-
int total_rows_for_each_expert[num_experts] = {
637+
/* int total_rows_for_each_expert[num_experts] = {
716638
148, 231, 404, 180, 127, 244, 224, 244, 110, 617, 289, 845, 191, 424, 30, 97, 57, 324,
717-
62, 77, 75, 144, 250, 287, 629, 370, 161, 101, 215, 113, 224, 35};
639+
62, 77, 75, 144, 250, 287, 629, 370, 161, 101, 215, 113, 224, 35}; */
640+
641+
int total_rows_for_each_expert[num_experts];
642+
for (int i = 0; i < num_experts; i++) {
643+
total_rows_for_each_expert[i] = 512;
644+
}
718645

719646
int num_tokens_incl_duplicated = 0;
720647
for (int i = 0; i < num_experts; i++) {
721648
num_tokens_incl_duplicated += total_rows_for_each_expert[i];
722649
}
723-
int n_moe = 3072;
724-
int k_moe = 4096;
650+
int n_moe = 16384;
651+
int k_moe = 5120;
725652

726653
cutlass::DeviceAllocation<int32_t> num_rows_per_expert_device;
727654
cutlass::DeviceAllocation<bfloat16_t> activations_data;

include/cutlass/epilogue/collective/xe_array_epilogue.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -494,7 +494,7 @@ template <typename ProblemShape_MNKL>
494494
TensorD mD_mnl;
495495
if constexpr (is_source_supported) {
496496
ElementC const *ptr_C_curr_batch =
497-
reinterpret_cast<ElementC const *>(params.ptr_C[0]) +
497+
reinterpret_cast<ElementC const *>((void*)(params.ptr_C)) +
498498
cumulative_M * N;
499499
mC_mnl = make_tensor(
500500
make_gmem_ptr(ptr_C_curr_batch),
@@ -504,7 +504,7 @@ template <typename ProblemShape_MNKL>
504504

505505
if constexpr (is_destination_supported) {
506506
ElementD *ptr_D_curr_batch =
507-
reinterpret_cast<ElementD *>(params.ptr_D[0]) +
507+
reinterpret_cast<ElementD *>((void*)(params.ptr_D)) +
508508
cumulative_M * N;
509509
mD_mnl = make_tensor(
510510
make_gmem_ptr(ptr_D_curr_batch),

include/cutlass/gemm/collective/xe_array_mma.hpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -311,10 +311,10 @@ template <typename ProblemShape_MNKL>
311311
const int32_t K = get<2>(problem_shape_mnkl);
312312

313313
ElementA const *ptr_A_curr_batch =
314-
reinterpret_cast<ElementA const *>(mainloop_params.ptr_A[0]) +
314+
reinterpret_cast<ElementA const *>((void*)(mainloop_params.ptr_A)) +
315315
cumulative_M * K;
316316
ElementB const *ptr_B_curr_batch =
317-
reinterpret_cast<ElementB const *>(mainloop_params.ptr_B[0]) +
317+
reinterpret_cast<ElementB const *>((void*)(mainloop_params.ptr_B)) +
318318
next_group * K * N;
319319

320320
Tensor mA = make_tensor(

0 commit comments

Comments (0)