diff --git a/CMakeLists.txt b/CMakeLists.txt index dc8f105d6..0ac7ee2d7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,10 @@ ecbuild_add_option( FEATURE ACC DESCRIPTION "Support for using GPUs with OpenACC" REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" ) +ecbuild_add_option( FEATURE GPU_HOST + DEFAULT OFF + DESCRIPTION "Use host backend for GPU version" ) + ecbuild_add_option( FEATURE DOUBLE_PRECISION DEFAULT ON DESCRIPTION "Support for Double Precision" ) @@ -99,19 +103,21 @@ ecbuild_add_option( FEATURE TRANSI # Search for available GPU runtimes, searching for CUDA first and, if not found, # attempt to find HIP -if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU)) - set(HAVE_CUDA 0) - set(HAVE_HIP 0) - ectrans_find_cuda() # sets "HAVE_CUDA" - if( NOT HAVE_CUDA ) - ectrans_find_hip() # sets "HAVE_HIP" +set(HAVE_CUDA 0) +set(HAVE_HIP 0) +if( NOT HAVE_GPU_HOST ) + if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU)) + ectrans_find_cuda() # sets "HAVE_CUDA" + if( NOT HAVE_CUDA ) + ectrans_find_hip() # sets "HAVE_HIP" + endif() endif() endif() ecbuild_add_option( FEATURE GPU DEFAULT OFF DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support)" - CONDITION (HAVE_HIP OR HAVE_CUDA) AND (HAVE_ACC OR HAVE_OMP) ) + CONDITION ((HAVE_HIP OR HAVE_CUDA) AND (HAVE_ACC OR HAVE_OMP)) OR HAVE_GPU_HOST ) # Check CPU or GPU is enabled, and if not, abort if( (NOT HAVE_CPU) AND (NOT HAVE_GPU) ) @@ -123,14 +129,13 @@ if( HAVE_GPU ) set( GPU_OFFLOAD "ACC" ) elseif( HAVE_OMP ) set( GPU_OFFLOAD "OMP" ) - else() - ecbuild_error("Could not enable GPU as OMP or ACC were not enabled") endif() endif() ecbuild_add_option( FEATURE CUTLASS DEFAULT OFF CONDITION HAVE_GPU AND HAVE_CUDA AND CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC" + AND NOT HAVE_GPU_HOST DESCRIPTION "Support for Cutlass BLAS operations" REQUIRED_PACKAGES "NvidiaCutlass VERSION 2.11" ) @@ -142,18 +147,18 @@ ecbuild_add_option( FEATURE CUTLASS_3XTF32 ecbuild_add_option( FEATURE GPU_AWARE_MPI DEFAULT ON - CONDITION HAVE_GPU AND HAVE_MPI + CONDITION HAVE_GPU AND HAVE_MPI AND NOT HAVE_GPU_HOST REQUIRED_PACKAGES "MPI COMPONENTS Fortran" DESCRIPTION "Enable GPU-aware MPI" ) ecbuild_add_option( FEATURE GPU_GRAPHS_GEMM DEFAULT ON - CONDITION HAVE_GPU + CONDITION HAVE_GPU AND NOT HAVE_GPU_HOST DESCRIPTION "Enable graph-based optimisation of Legendre transform GEMM kernel" ) ecbuild_add_option( FEATURE GPU_GRAPHS_FFT DEFAULT ON - CONDITION HAVE_GPU + CONDITION HAVE_GPU AND NOT HAVE_GPU_HOST DESCRIPTION "Enable graph-based optimisation of FFT kernels" ) if( BUILD_SHARED_LIBS ) diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 0a19b4b3b..4c67d4aaa 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -10,7 +10,6 @@ list( APPEND trans_gpu_common_src algor/ext_acc.F90 - algor/c_hipmemgetinfo.cpp algor/buffered_allocator_mod.F90 algor/device_mod.F90 algor/growing_allocator_mod.F90 @@ -24,6 +23,7 @@ if( HAVE_HIP ) algor/*.hip.cpp ) list( APPEND trans_gpu_common_src + algor/c_hipmemgetinfo.cpp algor/hicblas_gemm.hip.cpp algor/hicfft.hip.cpp ) @@ -32,12 +32,17 @@ elseif( HAVE_CUDA ) set( GPU_RUNTIME "CUDA" ) set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas nvhpcwrapnvtx CUDA::cudart ) list( APPEND trans_gpu_common_src + algor/c_hipmemgetinfo.cpp algor/hicblas_gemm.cuda.cu algor/hicfft.cuda.cu ) ecbuild_info("warn: IN_PLACE_FFT defined for cuFFT") else() - ecbuild_info("warn: HIP and CUDA not found") + set( GPU_RUNTIME "HOST" ) + list( APPEND trans_gpu_common_src + algor/fft.host.cpp + algor/gemm.host.cpp + ) endif() @@ -67,6 +72,12 @@ ecbuild_add_library( $<${HAVE_GPU_GRAPHS_FFT}:USE_GRAPHS_FFT> ) +if( HAVE_GPU_HOST ) + target_link_libraries( ectrans_gpu_common PRIVATE ${LAPACK_LIBRARIES} ) + target_link_libraries( ectrans_gpu_common PRIVATE ${FFTW_LIBRARIES} ) + target_include_directories( ectrans_gpu_common PRIVATE ${FFTW_INCLUDE_DIRS} ) +endif() + ectrans_target_fortran_module_directory( TARGET ectrans_gpu_common MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/ectrans @@ -169,7 +180,7 @@ foreach( prec dp sp ) endif() # cuFFT can do in-place FFT, hipFFT cannot - if( HAVE_CUDA ) + if( HAVE_CUDA OR (GPU_RUNTIME STREQUAL "HOST")) target_compile_definitions( ectrans_gpu_${prec} PRIVATE IN_PLACE_FFT ) endif() diff --git a/src/trans/gpu/algor/ext_acc.F90 b/src/trans/gpu/algor/ext_acc.F90 index 2cf79c98c..7216e3c44 100644 --- a/src/trans/gpu/algor/ext_acc.F90 +++ b/src/trans/gpu/algor/ext_acc.F90 @@ -27,12 +27,10 @@ module openacc_ext implicit none private - public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & + public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, ext_acc_delete + public :: ext_acc_arr_desc #ifdef ACCGPU - & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind -#endif -#ifdef OMPGPU - & ext_acc_delete, ext_acc_arr_desc + public :: acc_handle_kind #endif type common_pointer_descr @@ -271,9 +269,8 @@ subroutine ext_acc_create(ptrs, stream) type(ext_acc_arr_desc), intent(in) :: ptrs(:) #ifdef ACCGPU integer(acc_handle_kind), optional :: stream -#endif -#ifdef OMPGPU - integer(kind=int32), optional :: stream +#else + integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC #endif type(common_pointer_descr), allocatable :: common_ptrs(:) @@ -310,9 +307,8 @@ subroutine ext_acc_copyin(ptrs, stream) type(ext_acc_arr_desc), intent(in) :: ptrs(:) #ifdef ACCGPU integer(acc_handle_kind), optional :: stream -#endif -#ifdef OMPGPU - integer(kind=int32), optional :: stream +#else + integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC #endif type(common_pointer_descr), allocatable :: common_ptrs(:) @@ -349,9 +345,8 @@ subroutine ext_acc_copyout(ptrs, stream) type(ext_acc_arr_desc), intent(in) :: ptrs(:) #ifdef ACCGPU integer(acc_handle_kind), optional :: stream -#endif -#ifdef OMPGPU - integer(kind=int32), optional :: stream +#else + integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC #endif type(common_pointer_descr), allocatable :: common_ptrs(:) @@ -388,7 +383,7 @@ subroutine ext_acc_delete(ptrs, stream) #ifdef ACCGPU integer(acc_handle_kind), optional :: stream #else - integer(kind=int32), optional :: stream + integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC #endif type(common_pointer_descr), allocatable :: common_ptrs(:) diff --git a/src/trans/gpu/algor/fft.host.cpp b/src/trans/gpu/algor/fft.host.cpp new file mode 100644 index 000000000..6059ab539 --- /dev/null +++ b/src/trans/gpu/algor/fft.host.cpp @@ -0,0 +1,201 @@ +#include +#include +#include +#include +#include + +typedef enum fftType_t { + FFT_R2C = 0x2a, // Real to complex (interleaved) + FFT_C2R = 0x2c, // Complex (interleaved) to real + FFT_D2Z = 0x6a, // Double to double-complex (interleaved) + FFT_Z2D = 0x6c, // Double-complex (interleaved) to double +} fftType; + +namespace { +struct Double { + using real = double; + using cmplx = fftw_complex; + using plan_type = fftw_plan; +}; +struct Float { + using real = float; + using cmplx = fftwf_complex; + using plan_type = fftwf_plan; +}; + +template class fft_plan { + using real = typename Type::real; + using cmplx = typename Type::cmplx; + using plan_type = typename Type::plan_type; + +public: + void exec(real *data_real, cmplx *data_complex) const { + real *data_real_l = &data_real[offset]; + cmplx *data_complex_l = &data_complex[offset / 2]; + if constexpr (Direction == FFT_R2C) + fftwf_execute_dft_r2c(*plan_ptr, data_real_l, data_complex_l); + else if constexpr (Direction == FFT_C2R) + fftwf_execute_dft_c2r(*plan_ptr, data_complex_l, data_real_l); + else if constexpr (Direction == FFT_D2Z) + fftw_execute_dft_r2c(*plan_ptr, data_real_l, data_complex_l); + else if constexpr (Direction == FFT_Z2D) + fftw_execute_dft_c2r(*plan_ptr, data_complex_l, data_real_l); + } + fft_plan(plan_type plan_, int64_t offset_) + : plan_ptr(new plan_type{plan_}, + [](auto ptr) { + if constexpr (Direction == FFT_R2C || Direction == FFT_C2R) + fftwf_destroy_plan(*ptr); + else if constexpr (Direction == FFT_D2Z || Direction == FFT_Z2D) + fftw_destroy_plan(*ptr); + delete ptr; + }), + offset(offset_) {} + +private: + std::shared_ptr plan_ptr; + int64_t offset; +}; + +struct cache_key { + int resol_id; + int kfield; + bool operator==(const cache_key &other) const { + return resol_id == other.resol_id && kfield == other.kfield; + } + cache_key(int resol_id_, int kfield_) + : resol_id(resol_id_), kfield(kfield_) {} +}; +} // namespace + +template <> struct std::hash { + std::size_t operator()(const cache_key &k) const { + return k.resol_id * 10000 + k.kfield; + } +}; + +namespace { + +// kfield -> handles +template auto &get_fft_plan_cache() { + static std::unordered_map>> + fftPlansCache; + return fftPlansCache; +} + +// // kfield -> ptrs +// template auto &get_ptr_cache() { +// using real = typename Type::real; +// using cmplx = typename Type::cmplx; +// static std::unordered_map> ptrCache; +// return ptrCache; +// } + +template +void erase_resol_from_cache(Cache &cache, int resol_id) { + // Note that in C++20 this could also be std::erase_if + int erased = 0; + for (auto it = cache.begin(); it != cache.end();) { + if (it->first.resol_id == resol_id) { + it = cache.erase(it); + ++erased; + } else + ++it; + } +} +template +void erase_from_caches(int resol_id) { + erase_resol_from_cache(get_fft_plan_cache(), resol_id); + // erase_resol_from_cache(get_ptr_cache(), resol_id); +} + +template +std::vector> plan_all(int resol_id, int kfield, int *loens, + int nfft, int64_t *offsets) { + using cmplx = typename Type::cmplx; + using plan_type = typename Type::plan_type; + int flags = FFTW_ESTIMATE | FFTW_NO_SIMD; + + auto key = cache_key{resol_id, kfield}; + auto &fftPlansCache = get_fft_plan_cache(); + auto fftPlans = fftPlansCache.find(key); + if (fftPlans == fftPlansCache.end()) { + // the fft plans do not exist yet + cmplx *dummy; + if constexpr (Direction == FFT_R2C || Direction == FFT_C2R) + dummy = fftwf_alloc_complex((size_t)1); + else + dummy = fftw_alloc_complex((size_t)1); + + std::vector> newPlans; + newPlans.reserve(nfft); + for (int i = 0; i < nfft; ++i) { + int nloen = loens[i]; + + plan_type plan; + int dist = offsets[i + 1] - offsets[i]; + int embed[] = {1}; + + if constexpr (Direction == FFT_R2C) + plan = fftwf_plan_many_dft_r2c( + 1, &nloen, kfield, (float*)dummy, embed, 1, dist, dummy, embed, 1, dist / 2, flags + ); + else if constexpr (Direction == FFT_C2R) + plan = fftwf_plan_many_dft_c2r( + 1, &nloen, kfield, dummy, embed, 1, dist / 2, (float*)dummy, embed, 1, dist, flags + ); + else if constexpr (Direction == FFT_D2Z) + plan = fftw_plan_many_dft_r2c( + 1, &nloen, kfield, (double*)dummy, embed, 1, dist, dummy, embed, 1, dist / 2, flags + ); + else if constexpr (Direction == FFT_Z2D) + plan = fftw_plan_many_dft_c2r( + 1, &nloen, kfield, dummy, embed, 1, dist / 2, (double*)dummy, embed, 1, dist, flags + ); + newPlans.emplace_back(plan, kfield * offsets[i]); + } + fftPlansCache.insert({key, newPlans}); + } + return fftPlansCache.find(key)->second; +} +} // namespace + +extern "C" { +void execute_dir_fft_float(float *data_real, fftwf_complex *data_complex, + int resol_id, int kfield, int *loens, int64_t *offsets, int nfft, + void *growing_allocator) { + auto plans = plan_all(resol_id, kfield, loens, nfft, offsets); + for (auto &plan : plans) + plan.exec(data_real, data_complex); +} +void execute_inv_fft_float(fftwf_complex *data_complex, float *data_real, + int resol_id, int kfield, int *loens, int64_t *offsets, int nfft, + void *growing_allocator) { + auto plans = plan_all(resol_id, kfield, loens, nfft, offsets); + for (auto &plan : plans) + plan.exec(data_real, data_complex); +} +void execute_dir_fft_double(double *data_real, + fftw_complex *data_complex, int resol_id, int kfield, + int *loens, int64_t *offsets, int nfft, + void *growing_allocator) { + auto plans = plan_all(resol_id, kfield, loens, nfft, offsets); + for (auto &plan : plans) + plan.exec(data_real, data_complex); +} +void execute_inv_fft_double(fftw_complex *data_complex, + double *data_real, int resol_id, int kfield, int *loens, + int64_t *offsets, int nfft, void *growing_allocator) { + auto plans = plan_all(resol_id, kfield, loens, nfft, offsets); + for (auto &plan : plans) + plan.exec(data_real, data_complex); +} + +void clean_fft(int resol_id) { + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); + erase_from_caches(resol_id); +} +} diff --git a/src/trans/gpu/algor/gemm.host.cpp b/src/trans/gpu/algor/gemm.host.cpp new file mode 100644 index 000000000..b343d2b36 --- /dev/null +++ b/src/trans/gpu/algor/gemm.host.cpp @@ -0,0 +1,85 @@ +// (C) Copyright 2000- ECMWF. +// (C) Copyright 2024- NVIDIA. +// +// This software is licensed under the terms of the Apache Licence Version 2.0 +// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +// In applying this licence, ECMWF does not waive the privileges and immunities +// granted to it by virtue of its status as an intergovernmental organisation +// nor does it submit to any jurisdiction. + + +#include + +extern "C" void dgemm_( + const char *transa, const char *transb, + const int *m, const int *n, const int *k, + double *alpha, + const double *A, const int *lda, + const double *B, const int *ldb, + double *beta, + double *C, const int *ldc +); + +extern "C" void sgemm_( + const char *transa, const char *transb, + const int *m, const int *n, const int *k, + float *alpha, + const float *A, const int *lda, + const float *B, const int *ldb, + float *beta, + float *C, const int *ldc +); + +extern "C" { +void hipblas_dgemm_wrapper(char transa, char transb, int m, int n, int k, + double alpha, const double *A, int lda, int tda, + const double *B, int ldb, int tdb, double beta, + double *C, int ldc, int tdc, int batchCount, + size_t stream, void *growing_allocator) { + dgemm_(&transa, &transb, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); +} + +void hipblas_sgemm_wrapper(char transa, char transb, int m, int n, int k, + float alpha, const float *A, int lda, int tda, + const float *B, int ldb, int tdb, float beta, + float *C, int ldc, int tdc, int batchCount, + void *growing_allocator) { + sgemm_(&transa, &transb, &m, &n, &k, &alpha, A, &lda, B, &ldb, &beta, C, &ldc); +} + +void hipblas_sgemm_wrapper_grouped( + int resol_id, int blas_id, char transa, char transb, int m, const int *n, + const int *k, float alpha, const float *A, int lda, const int64_t *offsetsA, + const float *B, const int *ldb, const int64_t *offsetsB, float beta, + float *C, int ldc, const int64_t *offsetsC, int batchCount, size_t stream, + void *growing_allocator) { + + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; + sgemm_(&transa, &transb, &m, &n[i], &k[i], &alpha, A + offsetsA[i], &lda, B + offsetsB[i], + &ldb[i], &beta, C + offsetsC[i], &ldc); + } +} + +void hipblas_dgemm_wrapper_grouped(int resol_id, int blas_id, char transa, + char transb, int m, const int *n, + const int *k, double alpha, const double *A, + int lda, const int64_t *offsetsA, + const double *B, const int *ldb, + const int64_t *offsetsB, double beta, + double *C, int ldc, const int64_t *offsetsC, + int batchCount, size_t stream, + void *growing_allocator) { + for (int i = 0; i < batchCount; ++i) { + if (m == 0 || n[i] == 0 || k[i] == 0) + continue; + dgemm_(&transa, &transb, &m, &n[i], &k[i], &alpha, A + offsetsA[i], &lda, B + offsetsB[i], + &ldb[i], &beta, C + offsetsC[i], &ldc); + } +} + +void clean_gemm(int resol_id) { + // Just here to satisfy compilation -> there's nothing to clean on the host version +} +} diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 88a16b56f..526136931 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -382,12 +382,11 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & #ifdef ACCGPU + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & & STREAM=1_ACC_HANDLE_KIND) -#endif -#ifdef OMPGPU - & STREAM=1) +#else + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT)) #endif #ifdef ACCGPU @@ -845,12 +844,11 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA !PGP #endif - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & #ifdef ACCGPU + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & & STREAM=1_ACC_HANDLE_KIND) -#endif -#ifdef OMPGPU - & STREAM=1) +#else + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT)) #endif ! Free this now diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 4fcc11496..6e80aa73b 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -519,12 +519,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & #ifdef ACCGPU + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT), & & STREAM=1_ACC_HANDLE_KIND) -#endif -#ifdef OMPGPU - & STREAM=1) +#else + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT)) #endif #ifdef OMPGPU @@ -993,12 +992,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC UPDATE HOST(PGP3B) ASYNC(1) #endif ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & #ifdef ACCGPU + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT), & & STREAM=1_ACC_HANDLE_KIND) -#endif -#ifdef OMPGPU - & STREAM=1) +#else + IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT)) #endif IF (LSYNC_TRANS) THEN