Benchmarks: Revise Code - Add hipblasLt tuning to dist-inference cpp implementation #616

Merged · 5 commits · Apr 2, 2024
Changes from all commits
8 changes: 8 additions & 0 deletions superbench/benchmarks/micro_benchmarks/dist_inference.py
@@ -312,6 +312,12 @@ def add_parser_arguments(self):
required=False,
help='Whether to launch kernels in CUDA graph mode.',
)
self._parser.add_argument(
'--tune_gemm',
action='store_true',
required=False,
help='Whether to tune GEMM performance before testing.',
)

def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
@@ -356,6 +362,8 @@ def _preprocess(self):
(self._args.num_layers, self._args.num_warmup, self._args.num_steps)
if self._args.use_cuda_graph:
args += ' --use_cuda_graph'
if self._args.tune_gemm:
args += ' --tune_gemm'
self._commands = ['%s %s' % (self.__bin_path, args)]

return True
Changes to the dist-inference C++ implementation
@@ -27,6 +27,7 @@
*
*******************************************************************************/

#include <algorithm>
#include <chrono>
#include <cstdio>
#include <cstdlib>
@@ -60,6 +61,21 @@ using cublasLtHalf = hipblasLtHalf;
#else
#define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
#endif
#if HIP_VERSION >= 50700000
#include <hipblaslt/hipblaslt-ext.hpp>
#if HIP_VERSION >= 60000000
#define HIPBLASLT_GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo)
#else
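// For HIP < 6.0, hipblaslt_ext::getIndexFromAlgo is unavailable, so read the algorithm index
// directly from the opaque algo data; a negative value means no valid index.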
static int getIndexFromAlgo(hipblasLtMatmulAlgo_t &algo) {
int *algo_ptr = (int *)algo.data;
if (*algo_ptr < 0) {
return -1;
}
return *algo_ptr;
}
#define HIPBLASLT_GETINDEXFROMALGO(algo) getIndexFromAlgo(algo)
#endif
#endif
#else
#include <cublasLt.h>
#include <cuda_fp16.h>
@@ -94,23 +110,26 @@ using cublasLtHalf = half;
#endif

static void ShowUsage(char *argv[]) {
std::cerr << "Usage: " << argv[0] << " <options>\n"
<< "options:\n"
<< "\t-h, --help\t\t\t\tShow this help message\n"
<< "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n"
<< "\t-n \t\t\tn\t\tGEMM_STRIDED argument n\n"
<< "\t-k \t\t\tk \t\tGEMM_STRIDED argument k\n"
<< "\t--alpha \t\talpha \t\tGEMM_STRIDED argument alpha\n"
<< "\t--beta \t\t\tbeta \t\tGEMM_STRIDED argument beta\n"
<< "\t--num_layers \t\t\tnum_layers \t\tNumber of layers in the model\n"
<< "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n"
<< "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n"
<< "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n"
<< std::endl;
std::cerr
<< "Usage: " << argv[0] << " <options>\n"
<< "options:\n"
<< "\t-h, --help\t\t\t\tShow this help message\n"
<< "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n"
<< "\t-n \t\t\tn\t\tGEMM_STRIDED argument n\n"
<< "\t-k \t\t\tk \t\tGEMM_STRIDED argument k\n"
<< "\t--alpha \t\talpha \t\tGEMM_STRIDED argument alpha\n"
<< "\t--beta \t\t\tbeta \t\tGEMM_STRIDED argument beta\n"
<< "\t--num_layers \t\t\tnum_layers \t\tNumber of layers in the model\n"
<< "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n"
<< "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n"
<< "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n"
<< "\t--tune_gemm \t\t\ttune_gemm \t\tWhether to tune GEMM before testing. Currently only work for hipblasLt.\n"
<< std::endl;
}

static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_t *k, float *alpha, float *beta,
-                          int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph) {
+                          int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph,
+                          bool *tune_gemm) {
if (argc >= 2) {
for (int i = 1; i < argc; ++i) {
std::string arg = argv[i];
@@ -143,6 +162,8 @@ static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_
std::cerr << "not supported by current environment" << std::endl << std::endl;
return -1;
#endif
} else if (arg == "--tune_gemm") {
*tune_gemm = true;
} else {
std::cerr << "error with " << arg << std::endl;
std::cerr << "do not recognize option" << std::endl << std::endl;
@@ -182,10 +203,91 @@ void InitializeABCDEF(std::vector<cublasLtHalf> &ha, int64_t size_a, std::vector
}
}

#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 50700000
// Tune GEMM algorithm in local rank.
// Write <0 to ret_algo_time_in_ms if nothing found.
// Write >=0 to ret_algo_time_in_ms and write ret_algo if something is found.
void TuneHipblasLtGemmLocal(const hipblasLtHandle_t &handle, const hipblasLtMatmulDesc_t &matmul, float alpha, void *da,
const hipblasLtMatrixLayout_t &matA, void *db, const hipblasLtMatrixLayout_t &matB,
float beta, void *dc, const hipblasLtMatrixLayout_t &matC, void *dd,
const hipblasLtMatrixLayout_t &matD, void *d_workspace, uint64_t workspace_size,
const cudaStream_t &stream, int rank, int num_ranks, hipblasLtMatmulAlgo_t *ret_algo,
float *ret_algo_time_in_ms) {
std::vector<hipblasLtMatmulHeuristicResult_t> gemm_heuristics;
// Get all possible algorithms
CHECK_CUBLASLT_ERROR(hipblaslt_ext::getAllAlgos(
handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, HIPBLAS_OP_N, HIPBLAS_OP_N, DIST_INF_HIP_DATATYPE_R_16F,
DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F,
DIST_INF_HIP_COMPUTETYPE_F32, gemm_heuristics));
// Make sure the algorithm order is deterministic
std::sort(gemm_heuristics.begin(), gemm_heuristics.end(),
[](hipblasLtMatmulHeuristicResult_t &a, hipblasLtMatmulHeuristicResult_t &b) {
return HIPBLASLT_GETINDEXFROMALGO(a.algo) < HIPBLASLT_GETINDEXFROMALGO(b.algo);
});
// Timing utilities
cudaEvent_t start_event;
cudaEvent_t end_event;
const int kNumWarmups = 10;
const int kNumTestRuns = 100;
*ret_algo_time_in_ms = -1;
// Benchmark all algorithms in given shape
CHECK_CUDA_ERROR(cudaEventCreate(&start_event));
CHECK_CUDA_ERROR(cudaEventCreate(&end_event));
// Partition work evenly into different ranks
for (size_t algo_idx = rank; algo_idx < gemm_heuristics.size(); algo_idx += num_ranks) {
auto &algo = gemm_heuristics[algo_idx].algo;
size_t ret_workspace_size = 0;
auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, &alpha, matA, matB, &beta, matC, matD, algo,
ret_workspace_size);
if (status != HIPBLAS_STATUS_SUCCESS || ret_workspace_size >= workspace_size) {
continue;
}
for (int i = 0; i < kNumWarmups; i++) {
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
&algo, d_workspace, workspace_size, stream));
}
CHECK_CUDA_ERROR(cudaEventRecord(start_event, stream));
for (int i = 0; i < kNumTestRuns; i++) {
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
&algo, d_workspace, workspace_size, stream));
}
CHECK_CUDA_ERROR(cudaEventRecord(end_event, stream));
CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
float time_in_ms = 0;
CHECK_CUDA_ERROR(cudaEventElapsedTime(&time_in_ms, start_event, end_event));
time_in_ms /= kNumTestRuns;
if (*ret_algo_time_in_ms < 0 || time_in_ms < *ret_algo_time_in_ms) {
*ret_algo = algo;
*ret_algo_time_in_ms = time_in_ms;
}
}
CHECK_CUDA_ERROR(cudaEventDestroy(start_event));
CHECK_CUDA_ERROR(cudaEventDestroy(end_event));
}

// Select global best GEMM algorithms across ranks. Write global_algo if something is found.
void TuneHipblasLtGemmGlobal(int num_ranks, const hipblasLtMatmulAlgo_t &local_algo, float local_time_in_ms,
hipblasLtMatmulAlgo_t *global_algo) {
std::vector<hipblasLtMatmulAlgo_t> coll_algo(num_ranks);
std::vector<float> coll_time_in_ms(num_ranks);
MPI_Allgather(&local_algo, sizeof(local_algo), MPI_BYTE, coll_algo.data(), sizeof(local_algo), MPI_BYTE,
MPI_COMM_WORLD);
MPI_Allgather(&local_time_in_ms, sizeof(local_time_in_ms), MPI_BYTE, coll_time_in_ms.data(),
sizeof(local_time_in_ms), MPI_BYTE, MPI_COMM_WORLD);
float min_time_in_ms = -1;
for (int i = 0; i < num_ranks; i++) {
if (coll_time_in_ms[i] >= 0 && (min_time_in_ms < 0 || coll_time_in_ms[i] < min_time_in_ms)) {
min_time_in_ms = coll_time_in_ms[i];
*global_algo = coll_algo[i];
}
}
}
#endif

// B[m, k] * A[k, n] + C[m, n] = D[m, n]
// E[k, m] * D[m, n] + F[k, n] = G[k, n]
void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t num_layers, int32_t num_warmups,
-               int32_t num_iters, bool use_cuda_graph, ncclComm_t nccl_comm) {
+               int32_t num_iters, bool use_cuda_graph, bool tune_gemm, ncclComm_t nccl_comm, int rank, int num_ranks) {
const int kNcclBufAlignment = 512;

int size_a = k * n;
@@ -230,7 +332,11 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
CHECK_CUDA_ERROR(cudaMemcpy(de, he.data(), sizeof(cublasLtHalf) * size_e, cudaMemcpyHostToDevice));
CHECK_CUDA_ERROR(cudaMemcpy(df, hf.data(), sizeof(cublasLtHalf) * size_f, cudaMemcpyHostToDevice));

#if defined(__HIP_PLATFORM_AMD__)
uint64_t workspace_size = 256 * 1024 * 1024; // max workspace size allowed for hipblaslt
#else
uint64_t workspace_size = 1024 * 1024;
#endif
void *d_workspace;
CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size));
int returnedAlgoCount = 0;
@@ -279,8 +385,22 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
// E[k, m] * D[m, n] + F[k, n] = G[k, n]
CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul1, matB, matA, matC, matD, pref, 1,
heuristicResult1, &returnedAlgoCount));
hipblasLtMatmulAlgo_t algo1 = heuristicResult1[0].algo;
CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul2, matE, matD, matF, matG, pref, 1,
heuristicResult2, &returnedAlgoCount));
hipblasLtMatmulAlgo_t algo2 = heuristicResult2[0].algo;
#if HIP_VERSION >= 50700000
if (tune_gemm) {
hipblasLtMatmulAlgo_t ret_algo;
float ret_algo_time_in_ms;
TuneHipblasLtGemmLocal(handle, matmul1, alpha, db, matB, da, matA, beta, dc, matC, dd, matD, d_workspace,
workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo1);
TuneHipblasLtGemmLocal(handle, matmul2, alpha, de, matE, dd, matD, beta, df, matF, dg, matG, d_workspace,
workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo2);
}
#endif
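    // Note: if tuning finds no supported algorithm that fits the workspace on any rank,
    // TuneHipblasLtGemmGlobal leaves algo1/algo2 untouched and the heuristic results above are used.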
#else
cublasLtHandle_t handle;
cublasLtMatrixLayout_t matA, matB, matC, matD, matE, matF, matG;
@@ -328,13 +448,13 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
// cublasLt is not well supported by ROCm hipify tools, explicitly define ROCm logic instead.
#if defined(__HIP_PLATFORM_AMD__)
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
&heuristicResult1[0].algo, d_workspace, workspace_size, stream));
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
&heuristicResult2[0].algo, d_workspace, workspace_size, stream));
&algo1, d_workspace, workspace_size, stream));
CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
&algo2, d_workspace, workspace_size, stream));
#else
CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
&heuristicResult1[0].algo, d_workspace, workspace_size, stream));
CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
&heuristicResult2[0].algo, d_workspace, workspace_size, stream));
#endif
CHECK_NCCL_ERROR(ncclAllReduce(dg, dg, size_g, ncclFloat16, ncclSum, nccl_comm, stream));
@@ -456,18 +576,21 @@ int main(int argc, char *argv[]) {
int32_t num_warmups = 20;
int32_t num_iters = 100;
bool use_cuda_graph = false;
bool tune_gemm = false;

-    if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph)) {
+    if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph,
+                       &tune_gemm)) {
ShowUsage(argv);
return -1;
}

fprintf(stdout,
"Parameters: m=%ld, n=%ld, k=%ld, alpha=%f, beta=%f, num_layers=%d, num_warmups=%d, num_iters=%d, "
"use_cuda_graph=%d\n",
m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph);
"use_cuda_graph=%d, tune_gemm=%d\n",
m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph, (int)tune_gemm);

-    TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, nccl_comm);
+    TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, tune_gemm, nccl_comm, comm_rank,
+              comm_size);

CHECK_NCCL_ERROR(ncclCommDestroy(nccl_comm));
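
The tuning path added above works in two stages: TuneHipblasLtGemmLocal lets each rank time every num_ranks-th candidate from the deterministically sorted algorithm list and keep its fastest one, and TuneHipblasLtGemmGlobal exchanges the per-rank winners with MPI_Allgather so that every rank adopts the single fastest algorithm; because all ranks compute the same minimum over the same gathered data, the algorithm choice stays consistent across the distributed pipeline. Below is a minimal standalone sketch of that partition-and-select logic; it is illustrative only, with plain integers standing in for hipblasLt algorithms, made-up timings in place of real measurements, and the MPI exchange simulated with local arrays.

#include <cstdio>
#include <vector>

int main() {
    // Illustrative sizes only: 10 candidate algorithms shared round-robin by 4 ranks.
    const int num_algos = 10, num_ranks = 4;
    std::vector<float> best_time(num_ranks, -1.0f); // -1 means "no usable algorithm", as in TuneHipblasLtGemmLocal
    std::vector<int> best_algo(num_ranks, -1);

    for (int rank = 0; rank < num_ranks; ++rank) {
        // Round-robin partition: rank r benchmarks candidates r, r + num_ranks, r + 2 * num_ranks, ...
        for (int algo_idx = rank; algo_idx < num_algos; algo_idx += num_ranks) {
            float time_in_ms = 1.0f + 0.1f * (algo_idx % 7); // fabricated stand-in for the measured GEMM time
            if (best_time[rank] < 0 || time_in_ms < best_time[rank]) {
                best_time[rank] = time_in_ms;
                best_algo[rank] = algo_idx;
            }
        }
        std::printf("rank %d local best: algo %d (%.2f ms)\n", rank, best_algo[rank], best_time[rank]);
    }

    // Global selection (done via MPI_Allgather in the real code): the fastest valid result wins.
    float min_time = -1.0f;
    int global_algo = -1;
    for (int r = 0; r < num_ranks; ++r) {
        if (best_time[r] >= 0 && (min_time < 0 || best_time[r] < min_time)) {
            min_time = best_time[r];
            global_algo = best_algo[r];
        }
    }
    std::printf("global best: algo %d (%.2f ms)\n", global_algo, min_time);
    return 0;
}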

7 changes: 5 additions & 2 deletions tests/benchmarks/micro_benchmarks/test_dist_inference.py
@@ -53,6 +53,7 @@ def test_pytorch_dist_inference_normal():
assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
assert (benchmark._args.use_cuda_graph is False)
assert (benchmark._args.tune_gemm is False)

# Check results and metrics.
assert (benchmark.run_count == 1)
@@ -98,6 +99,7 @@ def test_pytorch_dist_inference_fake_distributed():
assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
assert (benchmark._args.use_cuda_graph is False)
assert (benchmark._args.tune_gemm is False)

# Check results and metrics.
assert (benchmark.run_count == 1)
@@ -136,7 +138,7 @@ def _test_dist_inference_command_generation(self, platform):
num_steps = 8
wrapper_params_format_str = \
'--batch_size %d --input_size %d --hidden_size %d ' \
-            '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph'
+            '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph --tune_gemm'
parameters = wrapper_params_format_str % (
batch_size, input_size, hidden_size, alpha, beta, num_layers, num_warmup, num_steps
)
@@ -161,14 +163,15 @@ def _test_dist_inference_command_generation(self, platform):
assert (benchmark._args.num_warmup == num_warmup)
assert (benchmark._args.num_steps == num_steps)
assert (benchmark._args.use_cuda_graph is True)
assert (benchmark._args.tune_gemm is True)

# Check command
assert (1 == len(benchmark._commands))
for cmd in benchmark._commands:
m, n, k = hidden_size, batch_size, input_size
bench_params_format_str = \
'%s -m %d -n %d -k %d --alpha %g --beta %g ' + \
-                '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph'
+                '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph --tune_gemm'
assert (
cmd == (
bench_params_format_str %