Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
68bb8e4
minor fix of the type of an internal function name.
shbae Apr 28, 2026
5ae84f2
[hipblaslt] implement host-side code for the 64bit offset support in …
shbae May 12, 2026
299f898
[hipblaslt] pass batch offset values as kernel arguments.
shbae May 13, 2026
122c1cc
[hipblaslt] add offset parameters into the kernel signature.
shbae May 19, 2026
5a8e6ae
[hipblaslt] update kernel generation to use offsets in general batch …
shbae May 20, 2026
ef58b1d
[hipblaslt] add waitcnt instruction after SLoadB64 before updating th…
shbae May 20, 2026
f86ea62
[hipblaslt] implement tests for 64-bit offset support
shbae May 28, 2026
c4767cc
[hipblaslt] update tensilelite code-gen part for 64bit offset support…
shbae May 28, 2026
95f65e7
[hipblaslt] fix bugs in computeStoreSrd() and remove unnecessary change.
shbae May 29, 2026
bc1e76a
[hipblaslt] update 64-bit offset support for BetaOnly and Conversion.
shbae May 30, 2026
75c41a7
[hipblaslt] fix bugs in the changes of KernelWriterConversion.py
shbae Jun 1, 2026
9687d09
[hipblaslt] remove unnecessary if-condition and minor update.
shbae Jun 2, 2026
a1e6023
[hipblaslt] fix CI test failures.
shbae Jun 3, 2026
a5c0c37
remove temporary debug implementation.
shbae Jun 3, 2026
6a04c1c
[hipblaslt] fix the bug related to CI failures.
shbae Jun 10, 2026
dfcb804
[hipblaslt] place the batch offset argument at the tail of the kernar…
shbae Jun 11, 2026
f506496
[hipblaslt] modify the batch offset value as in elements.
shbae Jun 12, 2026
314abe7
remove temporary debug test and update matmul_batch_offset_large test.
shbae Jun 12, 2026
a1aa531
[hipblaslt] modify matmul_batch_offset_large test with actual 64-bit …
shbae Jun 15, 2026
19ad018
limit the tested gpu_arch for matmul_batch_offset_large tests.
shbae Jun 15, 2026
536d137
[hipblaslt] remove checking the positive offset value
shbae Jun 17, 2026
74324ee
[hipblaslt] add matmul_batch_offset_all_solutions test.
shbae Jun 24, 2026
905a59e
[hipblaslt] don't add batchOffset arguments to the Custom Kernels, an…
shbae Jun 24, 2026
d36670a
[hipblaslt] add tests for negative batch offset values.
shbae Jun 24, 2026
24de4c7
[hipblaslt] check validity with non-zero offsets with POINTER_ARRAY m…
shbae Jun 25, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -112,6 +112,12 @@ struct Arguments
int32_t batch_count;
int32_t batch_mode;

// Batch offset support for general batched GEMM
int64_t batch_offset_a;
int64_t batch_offset_b;
int64_t batch_offset_c;
int64_t batch_offset_d;

int32_t iters;
int32_t cold_iters;

Expand Down Expand Up @@ -251,6 +257,10 @@ struct Arguments
OPER(lde) SEP \
OPER(batch_count) SEP \
OPER(batch_mode) SEP \
OPER(batch_offset_a) SEP \
OPER(batch_offset_b) SEP \
OPER(batch_offset_c) SEP \
OPER(batch_offset_d) SEP \
OPER(iters) SEP \
OPER(cold_iters) SEP \
OPER(warmup_time) SEP \
Expand Down

Large diffs are not rendered by default.

8 changes: 8 additions & 0 deletions projects/hipblaslt/clients/tests/data/hipblaslt_common.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -597,6 +597,10 @@ Arguments:
- lde: c_int64*32
- batch_count: c_int32
- batch_mode: c_int32
- batch_offset_a: c_int64
- batch_offset_b: c_int64
- batch_offset_c: c_int64
- batch_offset_d: c_int64
- iters: c_int32
- cold_iters: c_int32
- warmup_time: c_float
Expand Down Expand Up @@ -723,6 +727,10 @@ Defaults:
transB: '*'
batch_count: 1
batch_mode: 0
batch_offset_a: 0
batch_offset_b: 0
batch_offset_c: 0
batch_offset_d: 0
HMM: false
pad: 4096
threads: 0
Expand Down
200 changes: 200 additions & 0 deletions projects/hipblaslt/clients/tests/data/matmul_gtest.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -3211,4 +3211,204 @@ Tests:
requested_solution_num: 10
gpu_arch: '950'

# ==============================================================================
# Batch Offset Tests - 64-bit offset support for general batched GEMM
# ==============================================================================

# Quick validation test - verifies basic offset functionality
- name: matmul_batch_offset_quick
category: quick
function: matmul_batch_offset
precision: *real_precisions
transA: N
transB: N
M: 256
N: 128
K: 64
lda: 256
ldb: 64
ldc: 256
ldd: 256
batch_mode: 1 # Pointer array mode
batch_count: 2
batch_offset_a: 0
batch_offset_b: 64
batch_offset_c: 128
batch_offset_d: 256
alpha: 1.0
beta: 0.0
unit_check: 1
norm_check: 1

# Offset variation test - various offset values
# Note: Uses M=256 to avoid known General Batched GEMM issue with larger sizes
- name: matmul_batch_offset_values
category: pre_checkin
function: matmul_batch_offset
precision: *real_precisions
transA: N
transB: N
M: 256
N: 128
K: 128
lda: 256
ldb: 128
ldc: 256
ldd: 256
batch_mode: 1
batch_count: 3
batch_offset_a: [0, 64, 256, 512]
batch_offset_b: [0, 64, 256, 512]
batch_offset_c: [0, 64, 256, 512]
batch_offset_d: [0, 64, 256, 512]
alpha: 1.0
beta: [0.0, 1.0]
unit_check: 1
norm_check: 1

# Transpose with offset
# Note: lda/ldb must be valid for all transpose combinations
# For transA=N: lda >= M, for transA=T: lda >= K
# For transB=N: ldb >= K, for transB=T: ldb >= N
# Using max(M,K)=256 for lda and max(K,N)=256 for ldb to cover all cases
- name: matmul_batch_offset_transpose
category: pre_checkin
function: matmul_batch_offset
precision: *real_precisions
transA: [N, T]
transB: [N, T]
M: 256
N: 256
K: 128
lda: 256
ldb: 256
ldc: 256
ldd: 256
batch_mode: 1
batch_count: 3
batch_offset_a: 128
batch_offset_b: 128
batch_offset_c: 128
batch_offset_d: 128
alpha: 1.0
beta: 0.5
unit_check: 1
norm_check: 1

# Alpha/Beta edge cases with offset
- name: matmul_batch_offset_alpha_beta
category: pre_checkin
function: matmul_batch_offset
precision: *real_precisions
transA: N
transB: N
M: 256
N: 128
K: 64
batch_mode: 1
batch_count: 4
batch_offset_a: 64
batch_offset_b: 64
batch_offset_c: 64
batch_offset_d: 64
alpha_beta: *alpha_beta_range
unit_check: 1
norm_check: 1

# Large offset test
- name: matmul_batch_offset_large
category: nightly
function: matmul_batch_offset
precision: [*hpa_half_precision, *hpa_bf16_precision]
transA: N
transB: N
M: 1024
N: 1024
K: 256
lda: 1024
ldb: 256
ldc: 1024
ldd: 1024
batch_mode: 1
batch_count: 2
batch_offset_a: 4294967296
batch_offset_b: 4294967297
batch_offset_c: 4294967298
batch_offset_d: 4097
alpha: 1.0
beta: 1.0
unit_check: 1
norm_check: 1
gpu_arch: '9(42|50)'

# Test all solutions with batch offsets and all transpose combinations
- name: matmul_batch_offset_all_solutions
category: nightly
function: matmul_batch_offset
precision: *real_precisions
transA_transB: *transA_transB_range
M: 256
N: 256
K: 128
lda: 256
ldb: 256
ldc: 256
ldd: 256
batch_mode: 1
batch_count: 2
batch_offset_a: 256
batch_offset_b: 128
batch_offset_c: 512
batch_offset_d: 256
alpha: 1.0
beta: 1.0
requested_solution_num: -1
unit_check: 1

# Test with negative batch offsets to verify proper memory layout handling
- name: matmul_batch_offset_negative

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@TorreZuk I've added matmul_batch_offset_negative and matmul_batch_offset_mixed tests here and they are all PASSED locally.

category: nightly
function: matmul_batch_offset
precision: *real_precisions
transA_transB: *transA_transB_range
M: 256
N: 256
K: 128
lda: 256
ldb: 256
ldc: 256
ldd: 256
batch_mode: 1
batch_count: 2
batch_offset_a: -128
batch_offset_b: -64
batch_offset_c: -256
batch_offset_d: -128
alpha: 1.0
beta: 1.0
unit_check: 1

# Test with mixed positive and negative offsets
- name: matmul_batch_offset_mixed
category: nightly
function: matmul_batch_offset
precision: *real_precisions
transA_transB: *transA_transB_range
M: 256
N: 256
K: 128
lda: 256
ldb: 256
ldc: 256
ldd: 256
batch_mode: 1
batch_count: 2
batch_offset_a: -64
batch_offset_b: 128
batch_offset_c: -128
batch_offset_d: 256
alpha: 1.0
beta: 1.0
unit_check: 1

...
6 changes: 5 additions & 1 deletion projects/hipblaslt/clients/tests/src/matmul_gtest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@
#include "hipblaslt_datatype2string.hpp"
#include "hipblaslt_test.hpp"
#include "testing_matmul.hpp"
#include "testing_matmul_batch_offset.hpp"
#include <cctype>
#include <cstring>
#include <type_traits>
Expand All @@ -48,6 +49,8 @@ namespace
testing_matmul(arg);
else if(!strcmp(arg.function, "matmul_bad_arg"))
testing_matmul_bad_arg(arg);
else if(!strcmp(arg.function, "matmul_batch_offset"))
testing_matmul_batch_offset(arg);
else
FAIL() << "Internal error: Test called with unknown function: " << arg.function;
}
Expand All @@ -64,7 +67,8 @@ namespace
// Filter for which functions apply to this suite
static bool function_filter(const Arguments& arg)
{
return !strcmp(arg.function, "matmul") || !strcmp(arg.function, "matmul_bad_arg");
return !strcmp(arg.function, "matmul") || !strcmp(arg.function, "matmul_bad_arg")
|| !strcmp(arg.function, "matmul_batch_offset");
}

// Google Test name suffix based on parameters
Expand Down
14 changes: 12 additions & 2 deletions projects/hipblaslt/library/include/hipblaslt/hipblaslt.h
Original file line number Diff line number Diff line change
Expand Up @@ -163,15 +163,25 @@ typedef enum {
* ``int64_t``
*/
HIPBLASLT_MATRIX_LAYOUT_LD = 6,

/** Matrix Batch Mode.
* Batched GEMM can be either:
* 1. Strided Batch: Single contiguous memory allocation and stride between matrices in
* the batch is specified in terms of number of elements.
* 2. General Batched: This uses pointer array with each pointer storing the base address
* 2. General Batched: This uses pointer array with each pointer storing the base address
* of the matrices in the batch.
* See hipblasLtBatchMode_t
*/
HIPBLASLT_MATRIX_LAYOUT_BATCH_MODE = 7,
HIPBLASLT_MATRIX_LAYOUT_BATCH_MODE = 7,

/** Matrix Offset.
*
* For ``General Batched GEMM``, we can support for users to access a sub-matrix of
* the original matrix by adding an ``offset`` value (in elements) from the base address.
* Note that for non-batched or Strided Batch GEMM case, we can directly apply
* the offset value by using the strided-offset value.
*/
HIPBLASLT_MATRIX_LAYOUT_OFFSET = 8
} hipblasLtMatrixLayoutAttribute_t;

/*! \ingroup types_module
Expand Down
2 changes: 1 addition & 1 deletion projects/hipblaslt/library/src/amd_detail/hipblaslt.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -262,7 +262,7 @@ try
{
rocblaslt::Debug::Instance().markerStart("hipblasLtMatrixLayoutDestroy");
auto status = RocBlasLtStatusToHIPStatus(
rocblaslt_matrix_layout_destory((const rocblaslt_matrix_layout)descr));
rocblaslt_matrix_layout_destroy((const rocblaslt_matrix_layout)descr));
rocblaslt::Debug::Instance().markerStop();
return status;
}
Expand Down
16 changes: 16 additions & 0 deletions projects/hipblaslt/library/src/amd_detail/include/auxiliary.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,22 @@ constexpr const char* hip_datatype_to_string(hipDataType type)
return "invalid";
}

// Returns true for sub-byte MX-style data types (fp6/fp4).
// Used to reject features that require byte-addressable elements.
HIPBLASLT_EXPORT
constexpr bool hip_datatype_is_mxtype(hipDataType type)
{
switch(type)
{
case HIP_R_6F_E2M3:
case HIP_R_6F_E3M2:
case HIP_R_4F_E2M1:
return true;
default:
return false;
}
}

// return precision string for hipDataType
HIPBLASLT_EXPORT
constexpr const char* hipblas_computetype_to_string(hipblasComputeType_t type)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -117,7 +117,7 @@ rocblaslt_status rocblaslt_get_sm_count_target(rocblaslt_handle handle,
* \brief Create a descriptor for matrix
* \details
* \p rocblaslt_matrix_layout_create creates a matrix descriptor It initializes
* It should be destroyed at the end using rocblaslt_matrix_layout_destory().
* It should be destroyed at the end using rocblaslt_matrix_layout_destroy().
*
* @param[out]
* matDescr the pointer to the matrix descriptor
Expand All @@ -136,7 +136,7 @@ rocblaslt_status rocblaslt_matrix_layout_create(rocblaslt_matrix_layout* matDesc
* \brief Destroy a matrix descriptor
*
* \details
* \p rocblaslt_matrix_layout_destory destroys a matrix descriptor and releases
* \p rocblaslt_matrix_layout_destroy destroys a matrix descriptor and releases
* all resources used by the descriptor
*
* @param[in]
Expand All @@ -145,7 +145,7 @@ rocblaslt_status rocblaslt_matrix_layout_create(rocblaslt_matrix_layout* matDesc
* \retval rocblaslt_status_success the operation completed successfully.
* \retval rocblaslt_status_invalid_pointer \p descr is invalid.
*/
rocblaslt_status rocblaslt_matrix_layout_destory(const rocblaslt_matrix_layout descr);
rocblaslt_status rocblaslt_matrix_layout_destroy(const rocblaslt_matrix_layout descr);
Comment thread
shbae marked this conversation as resolved.

rocblaslt_status rocblaslt_matrix_layout_set_attribute(rocblaslt_matrix_layout matLayout,
rocblaslt_matrix_layout_attribute attr,
Expand Down Expand Up @@ -187,7 +187,7 @@ rocblaslt_status rocblaslt_matmul_desc_create(rocblaslt_matmul_desc* matmulDesc,
* \brief Destroy a matrix multiplication descriptor
*
* \details
* \p rocblaslt_matrix_layout_destory destroys a multiplication matrix descr.
* \p rocblaslt_matrix_layout_destroy destroys a multiplication matrix descr.
*
* @param[in]
* descr the matrix multiplication descriptor
Expand Down
Loading
Loading