From cc89ee591c7fd4f2b8624ee2f0b1330b61fa6508 Mon Sep 17 00:00:00 2001
From: Ziyue Yang
Date: Tue, 2 Apr 2024 09:56:33 +0800
Subject: [PATCH] Benchmarks: Revise Code - Add hipblasLt tuning to
 dist-inference cpp implementation (#616)

**Description**

Adds hipblasLt tuning to dist-inference cpp implementation.

---
 .../micro_benchmarks/dist_inference.py        |   8 +
 .../dist_inference_cpp/dist_inference.cu      | 169 +++++++++++++++---
 .../micro_benchmarks/test_dist_inference.py   |   7 +-
 3 files changed, 159 insertions(+), 25 deletions(-)

diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference.py b/superbench/benchmarks/micro_benchmarks/dist_inference.py
index 7232a1409..27914f453 100644
--- a/superbench/benchmarks/micro_benchmarks/dist_inference.py
+++ b/superbench/benchmarks/micro_benchmarks/dist_inference.py
@@ -312,6 +312,12 @@ def add_parser_arguments(self):
             required=False,
             help='Whether to launch kernels in CUDA graph mode.',
         )
+        self._parser.add_argument(
+            '--tune_gemm',
+            action='store_true',
+            required=False,
+            help='Whether to tune GEMM performance before testing.',
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -356,6 +362,8 @@ def _preprocess(self):
             (self._args.num_layers, self._args.num_warmup, self._args.num_steps)
         if self._args.use_cuda_graph:
             args += ' --use_cuda_graph'
+        if self._args.tune_gemm:
+            args += ' --tune_gemm'
         self._commands = ['%s %s' % (self.__bin_path, args)]
 
         return True
diff --git a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu
index c6d7cd033..f33fae492 100644
--- a/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu
+++ b/superbench/benchmarks/micro_benchmarks/dist_inference_cpp/dist_inference.cu
@@ -27,6 +27,7 @@
  *
  *******************************************************************************/
 
+#include <algorithm>
 #include 
 #include 
 #include 
@@ -60,6 +61,21 @@ using cublasLtHalf = hipblasLtHalf;
 #else
 #define DIST_INF_HIP_COMPUTETYPE_F32 HIPBLASLT_COMPUTE_F32
 #endif
+#if HIP_VERSION >= 50700000
+#include <hipblaslt/hipblaslt-ext.hpp>
+#if HIP_VERSION >= 60000000
+#define HIPBLASLT_GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo)
+#else
+static int getIndexFromAlgo(hipblasLtMatmulAlgo_t &algo) {
+    int *algo_ptr = (int *)algo.data;
+    if (*algo_ptr < 0) {
+        return -1;
+    }
+    return *algo_ptr;
+}
+#define HIPBLASLT_GETINDEXFROMALGO(algo) getIndexFromAlgo(algo)
+#endif
+#endif
 #else
 #include 
 #include 
@@ -94,23 +110,26 @@ using cublasLtHalf = half;
 #endif
 
 static void ShowUsage(char *argv[]) {
-    std::cerr << "Usage: " << argv[0] << " \n"
-              << "options:\n"
-              << "\t-h, --help\t\t\t\tShow this help message\n"
-              << "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n"
-              << "\t-n \t\t\tn\t\tGEMM_STRIDED argument n\n"
-              << "\t-k \t\t\tk \t\tGEMM_STRIDED argument k\n"
-              << "\t--alpha \t\talpha \t\tGEMM_STRIDED argument alpha\n"
-              << "\t--beta \t\t\tbeta \t\tGEMM_STRIDED argument beta\n"
-              << "\t--num_layers \t\t\tnum_layers \t\tNumber of layers in the model\n"
-              << "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n"
-              << "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n"
-              << "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n"
-              << std::endl;
+    std::cerr
+        << "Usage: " << argv[0] << " \n"
+        << "options:\n"
+        << "\t-h, --help\t\t\t\tShow this help message\n"
+        << "\t-m \t\t\tm\t\tGEMM_STRIDED argument m\n"
+        << "\t-n \t\t\tn\t\tGEMM_STRIDED argument n\n"
+        << "\t-k \t\t\tk \t\tGEMM_STRIDED argument k\n"
+        << "\t--alpha \t\talpha \t\tGEMM_STRIDED argument alpha\n"
+        << "\t--beta \t\t\tbeta \t\tGEMM_STRIDED argument beta\n"
+        << "\t--num_layers \t\t\tnum_layers \t\tNumber of layers in the model\n"
+        << "\t--num_warmups \t\t\tnum_warmups \t\tNumber of warmup runs\n"
+        << "\t--num_iters \t\t\tnum_iters \t\tNumber of test runs\n"
+        << "\t--use_cuda_graph \t\t\tuse_cuda_graph \t\tWhether to launch kernels in CUDA graph mode\n"
+        << "\t--tune_gemm \t\t\ttune_gemm \t\tWhether to tune GEMM before testing. Currently only works for hipblasLt.\n"
+        << std::endl;
 }
 
 static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_t *k, float *alpha, float *beta,
-                          int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph) {
+                          int32_t *num_layers, int32_t *num_warmups, int32_t *num_iters, bool *use_cuda_graph,
+                          bool *tune_gemm) {
     if (argc >= 2) {
         for (int i = 1; i < argc; ++i) {
             std::string arg = argv[i];
@@ -143,6 +162,8 @@ static int ParseArguments(int argc, char *argv[], int64_t *m, int64_t *n, int64_
                 std::cerr << "not supported by current environment" << std::endl << std::endl;
                 return -1;
 #endif
+            } else if (arg == "--tune_gemm") {
+                *tune_gemm = true;
             } else {
                 std::cerr << "error with " << arg << std::endl;
                 std::cerr << "do not recognize option" << std::endl << std::endl;
@@ -182,10 +203,91 @@ void InitializeABCDEF(std::vector &ha, int64_t size_a, std::vector
 }
 }
 
+#if defined(__HIP_PLATFORM_AMD__) && HIP_VERSION >= 50700000
+// Tune GEMM algorithm in local rank.
+// Write <0 to ret_algo_time_in_ms if nothing found.
+// Write >=0 to ret_algo_time_in_ms and write ret_algo if something is found.
+void TuneHipblasLtGemmLocal(const hipblasLtHandle_t &handle, const hipblasLtMatmulDesc_t &matmul, float alpha, void *da,
+                            const hipblasLtMatrixLayout_t &matA, void *db, const hipblasLtMatrixLayout_t &matB,
+                            float beta, void *dc, const hipblasLtMatrixLayout_t &matC, void *dd,
+                            const hipblasLtMatrixLayout_t &matD, void *d_workspace, uint64_t workspace_size,
+                            const cudaStream_t &stream, int rank, int num_ranks, hipblasLtMatmulAlgo_t *ret_algo,
+                            float *ret_algo_time_in_ms) {
+    std::vector<hipblasLtMatmulHeuristicResult_t> gemm_heuristics;
+    // Get all possible algorithms
+    CHECK_CUBLASLT_ERROR(hipblaslt_ext::getAllAlgos(
+        handle, hipblaslt_ext::GemmType::HIPBLASLT_GEMM, HIPBLAS_OP_N, HIPBLAS_OP_N, DIST_INF_HIP_DATATYPE_R_16F,
+        DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F, DIST_INF_HIP_DATATYPE_R_16F,
+        DIST_INF_HIP_COMPUTETYPE_F32, gemm_heuristics));
+    // Make sure the algorithm order is deterministic
+    std::sort(gemm_heuristics.begin(), gemm_heuristics.end(),
+              [](hipblasLtMatmulHeuristicResult_t &a, hipblasLtMatmulHeuristicResult_t &b) {
+                  return HIPBLASLT_GETINDEXFROMALGO(a.algo) < HIPBLASLT_GETINDEXFROMALGO(b.algo);
+              });
+    // Timing utilities
+    cudaEvent_t start_event;
+    cudaEvent_t end_event;
+    const int kNumWarmups = 10;
+    const int kNumTestRuns = 100;
+    *ret_algo_time_in_ms = -1;
+    // Benchmark all algorithms in given shape
+    CHECK_CUDA_ERROR(cudaEventCreate(&start_event));
+    CHECK_CUDA_ERROR(cudaEventCreate(&end_event));
+    // Partition work evenly into different ranks
+    for (size_t algo_idx = rank; algo_idx < gemm_heuristics.size(); algo_idx += num_ranks) {
+        auto &algo = gemm_heuristics[algo_idx].algo;
+        size_t ret_workspace_size = 0;
+        auto status = hipblaslt_ext::matmulIsAlgoSupported(handle, matmul, &alpha, matA, matB, &beta, matC, matD, algo,
+                                                           ret_workspace_size);
+        if (status != HIPBLAS_STATUS_SUCCESS || ret_workspace_size >= workspace_size) {
+            continue;
+        }
+        for (int i = 0; i < kNumWarmups; i++) {
+            CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
+                                                 &algo, d_workspace, workspace_size, stream));
+        }
+        CHECK_CUDA_ERROR(cudaEventRecord(start_event, stream));
+        for (int i = 0; i < kNumTestRuns; i++) {
+            CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul, &alpha, da, matA, db, matB, &beta, dc, matC, dd, matD,
+                                                 &algo, d_workspace, workspace_size, stream));
+        }
+        CHECK_CUDA_ERROR(cudaEventRecord(end_event, stream));
+        CHECK_CUDA_ERROR(cudaStreamSynchronize(stream));
+        float time_in_ms = 0;
+        CHECK_CUDA_ERROR(cudaEventElapsedTime(&time_in_ms, start_event, end_event));
+        time_in_ms /= kNumTestRuns;
+        if (*ret_algo_time_in_ms < 0 || time_in_ms < *ret_algo_time_in_ms) {
+            *ret_algo = algo;
+            *ret_algo_time_in_ms = time_in_ms;
+        }
+    }
+    CHECK_CUDA_ERROR(cudaEventDestroy(start_event));
+    CHECK_CUDA_ERROR(cudaEventDestroy(end_event));
+}
+
+// Select global best GEMM algorithms across ranks. Write global_algo if something is found.
+void TuneHipblasLtGemmGlobal(int num_ranks, const hipblasLtMatmulAlgo_t &local_algo, float local_time_in_ms,
+                             hipblasLtMatmulAlgo_t *global_algo) {
+    std::vector<hipblasLtMatmulAlgo_t> coll_algo(num_ranks);
+    std::vector<float> coll_time_in_ms(num_ranks);
+    MPI_Allgather(&local_algo, sizeof(local_algo), MPI_BYTE, coll_algo.data(), sizeof(local_algo), MPI_BYTE,
+                  MPI_COMM_WORLD);
+    MPI_Allgather(&local_time_in_ms, sizeof(local_time_in_ms), MPI_BYTE, coll_time_in_ms.data(),
+                  sizeof(local_time_in_ms), MPI_BYTE, MPI_COMM_WORLD);
+    float min_time_in_ms = -1;
+    for (int i = 0; i < num_ranks; i++) {
+        if (coll_time_in_ms[i] >= 0 && (min_time_in_ms < 0 || coll_time_in_ms[i] < min_time_in_ms)) {
+            min_time_in_ms = coll_time_in_ms[i];
+            *global_algo = coll_algo[i];
+        }
+    }
+}
+#endif
+
 // B[m, k] * A[k, n] + C[m, n] = D[m, n]
 // E[k, m] * D[m, n] + F[k, n] = G[k, n]
 void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t num_layers, int32_t num_warmups,
-               int32_t num_iters, bool use_cuda_graph, ncclComm_t nccl_comm) {
+               int32_t num_iters, bool use_cuda_graph, bool tune_gemm, ncclComm_t nccl_comm, int rank, int num_ranks) {
     const int kNcclBufAlignment = 512;
 
     int size_a = k * n;
@@ -230,7 +332,11 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
     CHECK_CUDA_ERROR(cudaMemcpy(de, he.data(), sizeof(cublasLtHalf) * size_e, cudaMemcpyHostToDevice));
     CHECK_CUDA_ERROR(cudaMemcpy(df, hf.data(), sizeof(cublasLtHalf) * size_f, cudaMemcpyHostToDevice));
 
+#if defined(__HIP_PLATFORM_AMD__)
+    uint64_t workspace_size = 256 * 1024 * 1024; // max workspace size allowed for hipblaslt
+#else
     uint64_t workspace_size = 1024 * 1024;
+#endif
     void *d_workspace;
     CHECK_CUDA_ERROR(cudaMalloc(&d_workspace, workspace_size));
     int returnedAlgoCount = 0;
@@ -279,8 +385,22 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
     // E[k, m] * D[m, n] + F[k, n] = G[k, n]
     CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul1, matB, matA, matC, matD, pref, 1,
                                                          heuristicResult1, &returnedAlgoCount));
+    hipblasLtMatmulAlgo_t algo1 = heuristicResult1[0].algo;
     CHECK_CUBLASLT_ERROR(hipblasLtMatmulAlgoGetHeuristic(handle, matmul2, matE, matD, matF, matG, pref, 1,
                                                          heuristicResult2, &returnedAlgoCount));
+    hipblasLtMatmulAlgo_t algo2 = heuristicResult2[0].algo;
+#if HIP_VERSION >= 50700000
+    if (tune_gemm) {
+        hipblasLtMatmulAlgo_t ret_algo;
+        float ret_algo_time_in_ms;
+        TuneHipblasLtGemmLocal(handle, matmul1, alpha, db, matB, da, matA, beta, dc, matC, dd, matD, d_workspace,
+                               workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
+        TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo1);
+        TuneHipblasLtGemmLocal(handle, matmul2, alpha, de, matE, dd, matD, beta, df, matF, dg, matG, d_workspace,
+                               workspace_size, stream, rank, num_ranks, &ret_algo, &ret_algo_time_in_ms);
+        TuneHipblasLtGemmGlobal(num_ranks, ret_algo, ret_algo_time_in_ms, &algo2);
+    }
+#endif
 #else
     cublasLtHandle_t handle;
     cublasLtMatrixLayout_t matA, matB, matC, matD, matE, matF, matG;
@@ -328,13 +448,13 @@ void TestModel(int64_t m, int64_t n, int64_t k, float alpha, float beta, int32_t
 // cublasLt is not well supported by ROCm hipify tools, explicitly define ROCm logic instead.
 #if defined(__HIP_PLATFORM_AMD__)
         CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
-                                             &heuristicResult1[0].algo, d_workspace, workspace_size, stream));
-        CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
-                                             &heuristicResult2[0].algo, d_workspace, workspace_size, stream));
+                                             &algo1, d_workspace, workspace_size, stream));
+        CHECK_CUBLASLT_ERROR(hipblasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
+                                             &algo2, d_workspace, workspace_size, stream));
 #else
         CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, db, matB, da, matA, &beta, dc, matC, dd, matD,
                                             &heuristicResult1[0].algo, d_workspace, workspace_size, stream));
-        CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul1, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
+        CHECK_CUBLASLT_ERROR(cublasLtMatmul(handle, matmul2, &alpha, de, matE, dd, matD, &beta, df, matF, dg, matG,
                                             &heuristicResult2[0].algo, d_workspace, workspace_size, stream));
 #endif
         CHECK_NCCL_ERROR(ncclAllReduce(dg, dg, size_g, ncclFloat16, ncclSum, nccl_comm, stream));
@@ -456,18 +576,21 @@ int main(int argc, char *argv[]) {
     int32_t num_warmups = 20;
     int32_t num_iters = 100;
     bool use_cuda_graph = false;
+    bool tune_gemm = false;
 
-    if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph)) {
+    if (ParseArguments(argc, argv, &m, &n, &k, &alpha, &beta, &num_layers, &num_warmups, &num_iters, &use_cuda_graph,
+                       &tune_gemm)) {
         ShowUsage(argv);
         return -1;
    }
 
     fprintf(stdout,
             "Parameters: m=%ld, n=%ld, k=%ld, alpha=%f, beta=%f, num_layers=%d, num_warmups=%d, num_iters=%d, "
-            "use_cuda_graph=%d\n",
-            m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph);
+            "use_cuda_graph=%d, tune_gemm=%d\n",
+            m, n, k, alpha, beta, num_layers, num_warmups, num_iters, (int)use_cuda_graph, (int)tune_gemm);
 
-    TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, nccl_comm);
+    TestModel(m, n, k, alpha, beta, num_layers, num_warmups, num_iters, use_cuda_graph, tune_gemm, nccl_comm, comm_rank,
+              comm_size);
 
     CHECK_NCCL_ERROR(ncclCommDestroy(nccl_comm));
 
diff --git a/tests/benchmarks/micro_benchmarks/test_dist_inference.py b/tests/benchmarks/micro_benchmarks/test_dist_inference.py
index 2fb6a8a4c..43d469b67 100644
--- a/tests/benchmarks/micro_benchmarks/test_dist_inference.py
+++ b/tests/benchmarks/micro_benchmarks/test_dist_inference.py
@@ -53,6 +53,7 @@ def test_pytorch_dist_inference_normal():
     assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
     assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
     assert (benchmark._args.use_cuda_graph is False)
+    assert (benchmark._args.tune_gemm is False)
 
     # Check results and metrics.
     assert (benchmark.run_count == 1)
@@ -98,6 +99,7 @@ def test_pytorch_dist_inference_fake_distributed():
     assert (benchmark._args.distributed_impl == DistributedImpl.DDP)
     assert (benchmark._args.distributed_backend == DistributedBackend.NCCL)
     assert (benchmark._args.use_cuda_graph is False)
+    assert (benchmark._args.tune_gemm is False)
 
     # Check results and metrics.
     assert (benchmark.run_count == 1)
@@ -136,7 +138,7 @@ def _test_dist_inference_command_generation(self, platform):
         num_steps = 8
         wrapper_params_format_str = \
             '--batch_size %d --input_size %d --hidden_size %d ' \
-            '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph'
+            '--alpha %g --beta %g --num_layers %d --num_warmup %d --num_steps %d --use_cuda_graph --tune_gemm'
         parameters = wrapper_params_format_str % (
             batch_size, input_size, hidden_size, alpha, beta, num_layers, num_warmup, num_steps
         )
@@ -161,6 +163,7 @@ def _test_dist_inference_command_generation(self, platform):
         assert (benchmark._args.num_warmup == num_warmup)
         assert (benchmark._args.num_steps == num_steps)
         assert (benchmark._args.use_cuda_graph is True)
+        assert (benchmark._args.tune_gemm is True)
 
         # Check command
         assert (1 == len(benchmark._commands))
@@ -168,7 +171,7 @@ def _test_dist_inference_command_generation(self, platform):
         m, n, k = hidden_size, batch_size, input_size
         bench_params_format_str = \
             '%s -m %d -n %d -k %d --alpha %g --beta %g ' + \
-            '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph'
+            '--num_layers %d --num_warmups %d --num_iters %d --use_cuda_graph --tune_gemm'
         assert (
             cmd == (
                 bench_params_format_str %
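
For reference, here is a minimal Python sketch (not part of the patch) of how the wrapper-level `--tune_gemm` option propagates into the C++ benchmark's command line, mirroring the `_preprocess` change above. The `build_command` helper and the example sizes are illustrative assumptions; only the flag names and the `m, n, k = hidden_size, batch_size, input_size` mapping come from the patch.

```python
# Minimal sketch: mirrors how DistInference._preprocess (above) appends the new
# '--tune_gemm' flag to the generated command line. Helper name and example
# values are hypothetical; flag names match the patch.
def build_command(bin_path, m, n, k, alpha, beta, num_layers, num_warmups, num_iters,
                  use_cuda_graph=False, tune_gemm=False):
    args = '-m %d -n %d -k %d --alpha %g --beta %g ' \
        '--num_layers %d --num_warmups %d --num_iters %d' % \
        (m, n, k, alpha, beta, num_layers, num_warmups, num_iters)
    if use_cuda_graph:
        args += ' --use_cuda_graph'
    if tune_gemm:
        args += ' --tune_gemm'
    return '%s %s' % (bin_path, args)


# With the test's mapping m, n, k = hidden_size, batch_size, input_size:
print(build_command('dist_inference', 128, 80, 128, 1.0, 1.0, 50, 20, 100,
                    use_cuda_graph=True, tune_gemm=True))
# -> dist_inference -m 128 -n 80 -k 128 --alpha 1 --beta 1 --num_layers 50
#    --num_warmups 20 --num_iters 100 --use_cuda_graph --tune_gemm
```

Both the wrapper flag and the binary flag default to off (`action='store_true'` and `bool tune_gemm = false`), so existing runs are unchanged unless `--tune_gemm` is passed explicitly.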