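Bug fix: build A/B/C/D tensor strides with cutlass::make_cute_packed_stride instead of manually assigning cute::get<0> of the stride in the Xe array epilogue and MMA collectives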

sanchitintel · sanchitintel · commit 5b6f880076cc · 2025-10-14T22:12:36.000-07:00
diff --git a/include/cutlass/epilogue/collective/xe_array_epilogue.hpp b/include/cutlass/epilogue/collective/xe_array_epilogue.hpp
@@ -43,6 +43,7 @@
 #include "cutlass/epilogue/fusion/sm90_visitor_tma_warpspecialized.hpp"
 #include "cutlass/epilogue/fusion/xe_visitor_softmax.hpp"
 #include "cutlass/detail/layout.hpp"
+#include "../tools/util/include/cutlass/util/packed_stride.hpp"
 
 #include "cute/tensor.hpp"
 
@@ -114,6 +115,7 @@ class CollectiveEpilogue<
   using ElementScalar = typename FusionCallbacks::ElementScalar;
   static constexpr FloatRoundStyle RoundStyle = FloatRoundStyle::round_to_nearest;
 
+
   static_assert(cute::is_any_of_v<typename FusionCallbacks::Operation, 
                                 fusion::LinearCombination<ElementAccumulator, ElementCompute, ElementSource, ElementScalar, RoundStyle, false>,
                                 fusion::LinearCombination<ElementAccumulator, ElementCompute, ElementSource, ElementScalar, RoundStyle, true>>,
@@ -244,6 +246,7 @@ class CollectiveEpilogue<
       Arguments const& args) {
     constexpr int copy_alignment_bits = 128;
     constexpr int batch_alignment_bits = 512;
+
     bool implementable = true;
     bool fusion_implementable = true;
 
@@ -493,22 +496,20 @@ template <typename ProblemShape_MNKL>
       ElementC const *ptr_C_curr_batch =
           reinterpret_cast<ElementC const *>(params.ptr_C[0]) +
           cumulative_M * N;
-      auto c_stride = InternalStrideC{};
-      cute::get<0>(c_stride) = N;
       mC_mnl = make_tensor(
           make_gmem_ptr(ptr_C_curr_batch),
-          make_layout(make_shape(M, N, L), c_stride));
+          make_layout(make_shape(M, N, L), cutlass::make_cute_packed_stride(
+                                               InternalStrideC{}, {M, N, 1})));
     }
 
     if constexpr (is_destination_supported) {
       ElementD *ptr_D_curr_batch =
           reinterpret_cast<ElementD *>(params.ptr_D[0]) +
           cumulative_M * N;
-      auto d_stride = InternalStrideD{};
-      cute::get<0>(d_stride) = N;
       mD_mnl = make_tensor(
           make_gmem_ptr(ptr_D_curr_batch),
-          make_layout(make_shape(M, N, L), d_stride));
+          make_layout(make_shape(M, N, L), cutlass::make_cute_packed_stride(
+                                               InternalStrideD{}, {M, N, 1})));
     }
     return cute::make_tuple(mC_mnl, mD_mnl);
   }
diff --git a/include/cutlass/gemm/collective/xe_array_mma.hpp b/include/cutlass/gemm/collective/xe_array_mma.hpp
@@ -37,6 +37,7 @@
 #include "cute/algorithm/functional.hpp"
 #include "cute/atom/mma_atom.hpp"
 #include "cute/algorithm/gemm.hpp"
+#include "../tools/util/include/cutlass/util/packed_stride.hpp"
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
@@ -315,16 +316,13 @@ template <typename ProblemShape_MNKL>
     ElementB const *ptr_B_curr_batch =
         reinterpret_cast<ElementB const *>(mainloop_params.ptr_B[0]) +
         next_group * K * N;
-    auto a_stride = InternalStrideA{};
-    cute::get<0>(a_stride) = K;
+
     Tensor mA = make_tensor(
         make_gmem_ptr(ptr_A_curr_batch), make_shape(M, K, (int32_t)1),
-        a_stride);
-    auto b_stride = InternalStrideB{};
-    cute::get<0>(b_stride) = K;
+        cutlass::make_cute_packed_stride(InternalStrideA{}, {M, K, 1}));
     Tensor mB = make_tensor(
         make_gmem_ptr(ptr_B_curr_batch), make_shape(N, K, (int32_t)1),
-        b_stride);
+        cutlass::make_cute_packed_stride(InternalStrideB{}, {N, K, 1}));
 
     return cute::make_tuple(mA, mB);
   }