ROCm · shbae · Apr 28, 2026 · May 12, 2026 · May 13, 2026 · May 19, 2026
@@ -112,6 +112,12 @@ struct Arguments
     int32_t batch_count;
     int32_t batch_mode;
 
+    // Batch offset support for general batched GEMM
+    int64_t batch_offset_a;
+    int64_t batch_offset_b;
+    int64_t batch_offset_c;
+    int64_t batch_offset_d;
+
     int32_t iters;
     int32_t cold_iters;
 
@@ -251,6 +257,10 @@ struct Arguments
     OPER(lde) SEP                    \
     OPER(batch_count) SEP            \
     OPER(batch_mode) SEP             \
+    OPER(batch_offset_a) SEP         \
+    OPER(batch_offset_b) SEP         \
+    OPER(batch_offset_c) SEP         \
+    OPER(batch_offset_d) SEP         \
     OPER(iters) SEP                  \
     OPER(cold_iters) SEP             \
     OPER(warmup_time) SEP            \

@@ -597,6 +597,10 @@ Arguments:
   - lde: c_int64*32
   - batch_count: c_int32
   - batch_mode: c_int32
+  - batch_offset_a: c_int64
+  - batch_offset_b: c_int64
+  - batch_offset_c: c_int64
+  - batch_offset_d: c_int64
   - iters: c_int32
   - cold_iters: c_int32
   - warmup_time: c_float
@@ -723,6 +727,10 @@ Defaults:
   transB: '*'
   batch_count: 1
   batch_mode: 0
+  batch_offset_a: 0
+  batch_offset_b: 0
+  batch_offset_c: 0
+  batch_offset_d: 0
   HMM: false
   pad: 4096
   threads: 0

@@ -3211,4 +3211,204 @@ Tests:
   requested_solution_num: 10
   gpu_arch: '950'
 
+# ==============================================================================
+# Batch Offset Tests - 64-bit offset support for general batched GEMM
+# ==============================================================================
+
+# Quick validation test - verifies basic offset functionality
+- name: matmul_batch_offset_quick
+  category: quick
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA: N
+  transB: N
+  M: 256
+  N: 128
+  K: 64
+  lda: 256
+  ldb: 64
+  ldc: 256
+  ldd: 256
+  batch_mode: 1  # Pointer array mode
+  batch_count: 2
+  batch_offset_a: 0
+  batch_offset_b: 64
+  batch_offset_c: 128
+  batch_offset_d: 256
+  alpha: 1.0
+  beta: 0.0
+  unit_check: 1
+  norm_check: 1
+
+# Offset variation test - various offset values
+# Note: Uses M=256 to avoid known General Batched GEMM issue with larger sizes
+- name: matmul_batch_offset_values
+  category: pre_checkin
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA: N
+  transB: N
+  M: 256
+  N: 128
+  K: 128
+  lda: 256
+  ldb: 128
+  ldc: 256
+  ldd: 256
+  batch_mode: 1
+  batch_count: 3
+  batch_offset_a: [0, 64, 256, 512]
+  batch_offset_b: [0, 64, 256, 512]
+  batch_offset_c: [0, 64, 256, 512]
+  batch_offset_d: [0, 64, 256, 512]
+  alpha: 1.0
+  beta: [0.0, 1.0]
+  unit_check: 1
+  norm_check: 1
+
+# Transpose with offset
+# Note: lda/ldb must be valid for all transpose combinations
+# For transA=N: lda >= M, for transA=T: lda >= K
+# For transB=N: ldb >= K, for transB=T: ldb >= N
+# Using max(M,K)=256 for lda and max(K,N)=256 for ldb to cover all cases
+- name: matmul_batch_offset_transpose
+  category: pre_checkin
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA: [N, T]
+  transB: [N, T]
+  M: 256
+  N: 256
+  K: 128
+  lda: 256
+  ldb: 256
+  ldc: 256
+  ldd: 256
+  batch_mode: 1
+  batch_count: 3
+  batch_offset_a: 128
+  batch_offset_b: 128
+  batch_offset_c: 128
+  batch_offset_d: 128
+  alpha: 1.0
+  beta: 0.5
+  unit_check: 1
+  norm_check: 1
+
+# Alpha/Beta edge cases with offset
+- name: matmul_batch_offset_alpha_beta
+  category: pre_checkin
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA: N
+  transB: N
+  M: 256
+  N: 128
+  K: 64
+  batch_mode: 1
+  batch_count: 4
+  batch_offset_a: 64
+  batch_offset_b: 64
+  batch_offset_c: 64
+  batch_offset_d: 64
+  alpha_beta: *alpha_beta_range
+  unit_check: 1
+  norm_check: 1
+
+# Large offset test
+- name: matmul_batch_offset_large
+  category: nightly
+  function: matmul_batch_offset
+  precision: [*hpa_half_precision, *hpa_bf16_precision]
+  transA: N
+  transB: N
+  M: 1024
+  N: 1024
+  K: 256
+  lda: 1024
+  ldb: 256
+  ldc: 1024
+  ldd: 1024
+  batch_mode: 1
+  batch_count: 2
+  batch_offset_a: 4294967296
+  batch_offset_b: 4294967297
+  batch_offset_c: 4294967298
+  batch_offset_d: 4097
+  alpha: 1.0
+  beta: 1.0
+  unit_check: 1
+  norm_check: 1
+  gpu_arch: '9(42|50)'
+
+# Test all solutions with batch offsets and all transpose combinations
+- name: matmul_batch_offset_all_solutions
+  category: nightly
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA_transB: *transA_transB_range
+  M: 256
+  N: 256
+  K: 128
+  lda: 256
+  ldb: 256
+  ldc: 256
+  ldd: 256
+  batch_mode: 1
+  batch_count: 2
+  batch_offset_a: 256
+  batch_offset_b: 128
+  batch_offset_c: 512
+  batch_offset_d: 256
+  alpha: 1.0
+  beta: 1.0
+  requested_solution_num: -1
+  unit_check: 1
+
+# Test with negative batch offsets to verify proper memory layout handling
+- name: matmul_batch_offset_negative
+  category: nightly
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA_transB: *transA_transB_range
+  M: 256
+  N: 256
+  K: 128
+  lda: 256
+  ldb: 256
+  ldc: 256
+  ldd: 256
+  batch_mode: 1
+  batch_count: 2
+  batch_offset_a: -128
+  batch_offset_b: -64
+  batch_offset_c: -256
+  batch_offset_d: -128
+  alpha: 1.0
+  beta: 1.0
+  unit_check: 1
+
+# Test with mixed positive and negative offsets
+- name: matmul_batch_offset_mixed
+  category: nightly
+  function: matmul_batch_offset
+  precision: *real_precisions
+  transA_transB: *transA_transB_range
+  M: 256
+  N: 256
+  K: 128
+  lda: 256
+  ldb: 256
+  ldc: 256
+  ldd: 256
+  batch_mode: 1
+  batch_count: 2
+  batch_offset_a: -64
+  batch_offset_b: 128
+  batch_offset_c: -128
+  batch_offset_d: 256
+  alpha: 1.0
+  beta: 1.0
+  unit_check: 1
+
 ...
@@ -27,6 +27,7 @@
 #include "hipblaslt_datatype2string.hpp"
 #include "hipblaslt_test.hpp"
 #include "testing_matmul.hpp"
+#include "testing_matmul_batch_offset.hpp"
 #include <cctype>
 #include <cstring>
 #include <type_traits>
@@ -48,6 +49,8 @@ namespace
                 testing_matmul(arg);
             else if(!strcmp(arg.function, "matmul_bad_arg"))
                 testing_matmul_bad_arg(arg);
+            else if(!strcmp(arg.function, "matmul_batch_offset"))
+                testing_matmul_batch_offset(arg);
             else
                 FAIL() << "Internal error: Test called with unknown function: " << arg.function;
         }
@@ -64,7 +67,8 @@ namespace
         // Filter for which functions apply to this suite
         static bool function_filter(const Arguments& arg)
         {
-            return !strcmp(arg.function, "matmul") || !strcmp(arg.function, "matmul_bad_arg");
+            return !strcmp(arg.function, "matmul") || !strcmp(arg.function, "matmul_bad_arg")
+                   || !strcmp(arg.function, "matmul_batch_offset");
         }
 
         // Google Test name suffix based on parameters

@@ -163,15 +163,25 @@ typedef enum {
    * ``int64_t``
    */
   HIPBLASLT_MATRIX_LAYOUT_LD = 6,
+
   /** Matrix Batch Mode.
    * Batched GEMM can be either:
    * 1. Strided Batch: Single contiguous memory allocation and stride between matrices in
    * the batch is specified in terms of number of elements.
-   * 2. General Batched: This uses pointer array with each pointer storing the base address 
+   * 2. General Batched: This uses pointer array with each pointer storing the base address
    * of the matrices in the batch.
    * See hipblasLtBatchMode_t
    */
-  HIPBLASLT_MATRIX_LAYOUT_BATCH_MODE = 7,   
+  HIPBLASLT_MATRIX_LAYOUT_BATCH_MODE = 7,
+
+  /** Matrix Offset.
+   *
+   * For ``General Batched GEMM``, users can access a sub-matrix of the original
+   * matrix by adding an ``offset`` value (in elements) to the base address.
+   * Note that for the non-batched or Strided Batch GEMM case, the offset can be
+   * applied directly by using the strided-offset value.
+   */
+  HIPBLASLT_MATRIX_LAYOUT_OFFSET = 8
 } hipblasLtMatrixLayoutAttribute_t;
 
 /*! \ingroup types_module

@@ -262,7 +262,7 @@ try
 {
     rocblaslt::Debug::Instance().markerStart("hipblasLtMatrixLayoutDestroy");
     auto status = RocBlasLtStatusToHIPStatus(
-        rocblaslt_matrix_layout_destory((const rocblaslt_matrix_layout)descr));
+        rocblaslt_matrix_layout_destroy((const rocblaslt_matrix_layout)descr));
     rocblaslt::Debug::Instance().markerStop();
     return status;
 }

@@ -158,6 +158,22 @@ constexpr const char* hip_datatype_to_string(hipDataType type)
     return "invalid";
 }
 
+// Returns true for sub-byte MX-style data types (fp6/fp4).
+// Used to reject features that require byte-addressable elements.
+HIPBLASLT_EXPORT
+constexpr bool hip_datatype_is_mxtype(hipDataType type)
+{
+    switch(type)
+    {
+    case HIP_R_6F_E2M3:
+    case HIP_R_6F_E3M2:
+    case HIP_R_4F_E2M1:
+        return true;
+    default:
+        return false;
+    }
+}
+
 // return precision string for hipDataType
 HIPBLASLT_EXPORT
 constexpr const char* hipblas_computetype_to_string(hipblasComputeType_t type)

@@ -117,7 +117,7 @@ rocblaslt_status rocblaslt_get_sm_count_target(rocblaslt_handle handle,
  *  \brief Create a descriptor for matrix
  *  \details
  *  \p rocblaslt_matrix_layout_create creates a matrix descriptor It initializes
- *  It should be destroyed at the end using rocblaslt_matrix_layout_destory().
+ *  It should be destroyed at the end using rocblaslt_matrix_layout_destroy().
  *
  *  @param[out]
  *  matDescr   the pointer to the matrix descriptor
@@ -136,7 +136,7 @@ rocblaslt_status rocblaslt_matrix_layout_create(rocblaslt_matrix_layout* matDesc
  *  \brief Destroy a matrix descriptor
  *
  *  \details
- *  \p rocblaslt_matrix_layout_destory destroys a matrix descriptor and releases
+ *  \p rocblaslt_matrix_layout_destroy destroys a matrix descriptor and releases
  * all resources used by the descriptor
  *
  *  @param[in]
@@ -145,7 +145,7 @@ rocblaslt_status rocblaslt_matrix_layout_create(rocblaslt_matrix_layout* matDesc
  *  \retval rocblaslt_status_success the operation completed successfully.
  *  \retval rocblaslt_status_invalid_pointer \p descr is invalid.
  */
-rocblaslt_status rocblaslt_matrix_layout_destory(const rocblaslt_matrix_layout descr);
+rocblaslt_status rocblaslt_matrix_layout_destroy(const rocblaslt_matrix_layout descr);
 
 rocblaslt_status rocblaslt_matrix_layout_set_attribute(rocblaslt_matrix_layout           matLayout,
                                                        rocblaslt_matrix_layout_attribute attr,
@@ -187,7 +187,7 @@ rocblaslt_status rocblaslt_matmul_desc_create(rocblaslt_matmul_desc* matmulDesc,
  *  \brief Destroy a matrix multiplication descriptor
  *
  *  \details
- *  \p rocblaslt_matrix_layout_destory destroys a multiplication matrix descr.
+ *  \p rocblaslt_matmul_desc_destroy destroys a matrix multiplication descriptor.
  *
  *  @param[in]
  *  descr   the matrix multiplication descriptor