Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 17 additions & 12 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,10 @@ ecbuild_add_option( FEATURE ACC
DESCRIPTION "Support for using GPUs with OpenACC"
REQUIRED_PACKAGES "OpenACC COMPONENTS Fortran" )

ecbuild_add_option( FEATURE GPU_HOST
DEFAULT OFF
DESCRIPTION "Use host backend for GPU version" )

ecbuild_add_option( FEATURE DOUBLE_PRECISION
DEFAULT ON
DESCRIPTION "Support for Double Precision" )
Expand Down Expand Up @@ -99,19 +103,21 @@ ecbuild_add_option( FEATURE TRANSI

# Search for available GPU runtimes, searching for CUDA first and, if not found,
# attempt to find HIP
if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU))
set(HAVE_CUDA 0)
set(HAVE_HIP 0)
ectrans_find_cuda() # sets "HAVE_CUDA"
if( NOT HAVE_CUDA )
ectrans_find_hip() # sets "HAVE_HIP"
set(HAVE_CUDA 0)
set(HAVE_HIP 0)
if( NOT HAVE_GPU_HOST )
if( ECTRANS_ENABLE_GPU OR (NOT DEFINED ECTRANS_ENABLE_GPU AND ENABLE_GPU))
ectrans_find_cuda() # sets "HAVE_CUDA"
if( NOT HAVE_CUDA )
ectrans_find_hip() # sets "HAVE_HIP"
endif()
endif()
endif()

ecbuild_add_option( FEATURE GPU
DEFAULT OFF
DESCRIPTION "Compile GPU version of ectrans (Requires OpenACC or sufficient OpenMP offloading support)"
CONDITION (HAVE_HIP OR HAVE_CUDA) AND (HAVE_ACC OR HAVE_OMP) )
CONDITION ((HAVE_HIP OR HAVE_CUDA) AND (HAVE_ACC OR HAVE_OMP)) OR HAVE_GPU_HOST )

# Check CPU or GPU is enabled, and if not, abort
if( (NOT HAVE_CPU) AND (NOT HAVE_GPU) )
Expand All @@ -123,14 +129,13 @@ if( HAVE_GPU )
set( GPU_OFFLOAD "ACC" )
elseif( HAVE_OMP )
set( GPU_OFFLOAD "OMP" )
else()
ecbuild_error("Could not enable GPU as OMP or ACC were not enabled")
endif()
endif()

ecbuild_add_option( FEATURE CUTLASS
DEFAULT OFF
CONDITION HAVE_GPU AND HAVE_CUDA AND CMAKE_Fortran_COMPILER_ID MATCHES "NVHPC"
AND NOT HAVE_GPU_HOST
DESCRIPTION "Support for Cutlass BLAS operations"
REQUIRED_PACKAGES "NvidiaCutlass VERSION 2.11" )

Expand All @@ -142,18 +147,18 @@ ecbuild_add_option( FEATURE CUTLASS_3XTF32

ecbuild_add_option( FEATURE GPU_AWARE_MPI
DEFAULT ON
CONDITION HAVE_GPU AND HAVE_MPI
CONDITION HAVE_GPU AND HAVE_MPI AND NOT HAVE_GPU_HOST
REQUIRED_PACKAGES "MPI COMPONENTS Fortran"
DESCRIPTION "Enable GPU-aware MPI" )

ecbuild_add_option( FEATURE GPU_GRAPHS_GEMM
DEFAULT ON
CONDITION HAVE_GPU
CONDITION HAVE_GPU AND NOT HAVE_GPU_HOST
DESCRIPTION "Enable graph-based optimisation of Legendre transform GEMM kernel" )

ecbuild_add_option( FEATURE GPU_GRAPHS_FFT
DEFAULT ON
CONDITION HAVE_GPU
CONDITION HAVE_GPU AND NOT HAVE_GPU_HOST
DESCRIPTION "Enable graph-based optimisation of FFT kernels" )

if( BUILD_SHARED_LIBS )
Expand Down
17 changes: 14 additions & 3 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,6 @@

list( APPEND trans_gpu_common_src
algor/ext_acc.F90
algor/c_hipmemgetinfo.cpp
algor/buffered_allocator_mod.F90
algor/device_mod.F90
algor/growing_allocator_mod.F90
Expand All @@ -24,6 +23,7 @@ if( HAVE_HIP )
algor/*.hip.cpp
)
list( APPEND trans_gpu_common_src
algor/c_hipmemgetinfo.cpp
algor/hicblas_gemm.hip.cpp
algor/hicfft.hip.cpp
)
Expand All @@ -32,12 +32,17 @@ elseif( HAVE_CUDA )
set( GPU_RUNTIME "CUDA" )
set( ECTRANS_GPU_HIP_LIBRARIES CUDA::cufft CUDA::cublas nvhpcwrapnvtx CUDA::cudart )
list( APPEND trans_gpu_common_src
algor/c_hipmemgetinfo.cpp
algor/hicblas_gemm.cuda.cu
algor/hicfft.cuda.cu
)
ecbuild_info("warn: IN_PLACE_FFT defined for cuFFT")
else()
ecbuild_info("warn: HIP and CUDA not found")
set( GPU_RUNTIME "HOST" )
list( APPEND trans_gpu_common_src
algor/fft.host.cpp
algor/gemm.host.cpp
)
endif()


Expand Down Expand Up @@ -67,6 +72,12 @@ ecbuild_add_library(
$<${HAVE_GPU_GRAPHS_FFT}:USE_GRAPHS_FFT>
)

if( HAVE_GPU_HOST )
target_link_libraries( ectrans_gpu_common PRIVATE ${LAPACK_LIBRARIES} )
target_link_libraries( ectrans_gpu_common PRIVATE ${FFTW_LIBRARIES} )
target_include_directories( ectrans_gpu_common PRIVATE ${FFTW_INCLUDE_DIRS} )
endif()

ectrans_target_fortran_module_directory(
TARGET ectrans_gpu_common
MODULE_DIRECTORY ${PROJECT_BINARY_DIR}/module/ectrans
Expand Down Expand Up @@ -169,7 +180,7 @@ foreach( prec dp sp )
endif()

# cuFFT can do in-place FFT, hipFFT cannot
if( HAVE_CUDA )
if( HAVE_CUDA OR (GPU_RUNTIME STREQUAL "HOST"))
target_compile_definitions( ectrans_gpu_${prec} PRIVATE IN_PLACE_FFT )
endif()

Expand Down
25 changes: 10 additions & 15 deletions src/trans/gpu/algor/ext_acc.F90
Original file line number Diff line number Diff line change
Expand Up @@ -27,12 +27,10 @@ module openacc_ext
implicit none

private
public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, &
public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, ext_acc_delete
public :: ext_acc_arr_desc
#ifdef ACCGPU
& ext_acc_delete, ext_acc_arr_desc, acc_handle_kind
#endif
#ifdef OMPGPU
& ext_acc_delete, ext_acc_arr_desc
public :: acc_handle_kind
#endif

type common_pointer_descr
Expand Down Expand Up @@ -271,9 +269,8 @@ subroutine ext_acc_create(ptrs, stream)
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#endif
#ifdef OMPGPU
integer(kind=int32), optional :: stream
#else
integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC
#endif

type(common_pointer_descr), allocatable :: common_ptrs(:)
Expand Down Expand Up @@ -310,9 +307,8 @@ subroutine ext_acc_copyin(ptrs, stream)
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#endif
#ifdef OMPGPU
integer(kind=int32), optional :: stream
#else
integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC
#endif

type(common_pointer_descr), allocatable :: common_ptrs(:)
Expand Down Expand Up @@ -349,9 +345,8 @@ subroutine ext_acc_copyout(ptrs, stream)
type(ext_acc_arr_desc), intent(in) :: ptrs(:)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#endif
#ifdef OMPGPU
integer(kind=int32), optional :: stream
#else
integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC
#endif
type(common_pointer_descr), allocatable :: common_ptrs(:)

Expand Down Expand Up @@ -388,7 +383,7 @@ subroutine ext_acc_delete(ptrs, stream)
#ifdef ACCGPU
integer(acc_handle_kind), optional :: stream
#else
integer(kind=int32), optional :: stream
integer(kind=int32), optional :: stream ! not used by other runtimes than OpenACC
#endif
type(common_pointer_descr), allocatable :: common_ptrs(:)

Expand Down
201 changes: 201 additions & 0 deletions src/trans/gpu/algor/fft.host.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,201 @@
#include <unordered_map>
#include <vector>
#include <algorithm>
#include <memory>
#include <fftw3.h>

typedef enum fftType_t {
FFT_R2C = 0x2a, // Real to complex (interleaved)
FFT_C2R = 0x2c, // Complex (interleaved) to real
FFT_D2Z = 0x6a, // Double to double-complex (interleaved)
FFT_Z2D = 0x6c, // Double-complex (interleaved) to double
} fftType;

namespace {
struct Double {
using real = double;
using cmplx = fftw_complex;
using plan_type = fftw_plan;
};
struct Float {
using real = float;
using cmplx = fftwf_complex;
using plan_type = fftwf_plan;
};

template <class Type, fftType Direction> class fft_plan {
using real = typename Type::real;
using cmplx = typename Type::cmplx;
using plan_type = typename Type::plan_type;

public:
void exec(real *data_real, cmplx *data_complex) const {
real *data_real_l = &data_real[offset];
cmplx *data_complex_l = &data_complex[offset / 2];
if constexpr (Direction == FFT_R2C)
fftwf_execute_dft_r2c(*plan_ptr, data_real_l, data_complex_l);
else if constexpr (Direction == FFT_C2R)
fftwf_execute_dft_c2r(*plan_ptr, data_complex_l, data_real_l);
else if constexpr (Direction == FFT_D2Z)
fftw_execute_dft_r2c(*plan_ptr, data_real_l, data_complex_l);
else if constexpr (Direction == FFT_Z2D)
fftw_execute_dft_c2r(*plan_ptr, data_complex_l, data_real_l);
}
fft_plan(plan_type plan_, int64_t offset_)
: plan_ptr(new plan_type{plan_},
[](auto ptr) {
if constexpr (Direction == FFT_R2C || Direction == FFT_C2R)
fftwf_destroy_plan(*ptr);
else if constexpr (Direction == FFT_D2Z || Direction == FFT_Z2D)
fftw_destroy_plan(*ptr);
delete ptr;
}),
offset(offset_) {}

private:
std::shared_ptr<plan_type> plan_ptr;
int64_t offset;
};

struct cache_key {
int resol_id;
int kfield;
bool operator==(const cache_key &other) const {
return resol_id == other.resol_id && kfield == other.kfield;
}
cache_key(int resol_id_, int kfield_)
: resol_id(resol_id_), kfield(kfield_) {}
};
} // namespace

template <> struct std::hash<cache_key> {
std::size_t operator()(const cache_key &k) const {
return k.resol_id * 10000 + k.kfield;
}
};

namespace {

// kfield -> handles
template <class Type, fftType Direction> auto &get_fft_plan_cache() {
static std::unordered_map<cache_key,
std::vector<fft_plan<Type, Direction>>>
fftPlansCache;
return fftPlansCache;
}

// // kfield -> ptrs
// template <class Type, hipfftType Direction> auto &get_ptr_cache() {
// using real = typename Type::real;
// using cmplx = typename Type::cmplx;
// static std::unordered_map<cache_key, std::pair<real *, cmplx *>> ptrCache;
// return ptrCache;
// }

template <typename Cache>
void erase_resol_from_cache(Cache &cache, int resol_id) {
// Note that in C++20 this could also be std::erase_if
int erased = 0;
for (auto it = cache.begin(); it != cache.end();) {
if (it->first.resol_id == resol_id) {
it = cache.erase(it);
++erased;
} else
++it;
}
}
template <class Type, fftType Direction>
void erase_from_caches(int resol_id) {
erase_resol_from_cache(get_fft_plan_cache<Type, Direction>(), resol_id);
// erase_resol_from_cache(get_ptr_cache<Type, Direction>(), resol_id);
}

template <class Type, fftType Direction>
std::vector<fft_plan<Type, Direction>> plan_all(int resol_id, int kfield, int *loens,
int nfft, int64_t *offsets) {
using cmplx = typename Type::cmplx;
using plan_type = typename Type::plan_type;
int flags = FFTW_ESTIMATE | FFTW_NO_SIMD;

auto key = cache_key{resol_id, kfield};
auto &fftPlansCache = get_fft_plan_cache<Type, Direction>();
auto fftPlans = fftPlansCache.find(key);
if (fftPlans == fftPlansCache.end()) {
// the fft plans do not exist yet
cmplx *dummy;
if constexpr (Direction == FFT_R2C || Direction == FFT_C2R)
dummy = fftwf_alloc_complex((size_t)1);
else
dummy = fftw_alloc_complex((size_t)1);

std::vector<fft_plan<Type, Direction>> newPlans;
newPlans.reserve(nfft);
for (int i = 0; i < nfft; ++i) {
int nloen = loens[i];

plan_type plan;
int dist = offsets[i + 1] - offsets[i];
int embed[] = {1};

if constexpr (Direction == FFT_R2C)
plan = fftwf_plan_many_dft_r2c(
1, &nloen, kfield, (float*)dummy, embed, 1, dist, dummy, embed, 1, dist / 2, flags
);
else if constexpr (Direction == FFT_C2R)
plan = fftwf_plan_many_dft_c2r(
1, &nloen, kfield, dummy, embed, 1, dist / 2, (float*)dummy, embed, 1, dist, flags
);
else if constexpr (Direction == FFT_D2Z)
plan = fftw_plan_many_dft_r2c(
1, &nloen, kfield, (double*)dummy, embed, 1, dist, dummy, embed, 1, dist / 2, flags
);
else if constexpr (Direction == FFT_Z2D)
plan = fftw_plan_many_dft_c2r(
1, &nloen, kfield, dummy, embed, 1, dist / 2, (double*)dummy, embed, 1, dist, flags
);
newPlans.emplace_back(plan, kfield * offsets[i]);
}
fftPlansCache.insert({key, newPlans});
}
return fftPlansCache.find(key)->second;
}
} // namespace

extern "C" {
void execute_dir_fft_float(float *data_real, fftwf_complex *data_complex,
int resol_id, int kfield, int *loens, int64_t *offsets, int nfft,
void *growing_allocator) {
auto plans = plan_all<Float, FFT_R2C>(resol_id, kfield, loens, nfft, offsets);
for (auto &plan : plans)
plan.exec(data_real, data_complex);
}
void execute_inv_fft_float(fftwf_complex *data_complex, float *data_real,
int resol_id, int kfield, int *loens, int64_t *offsets, int nfft,
void *growing_allocator) {
auto plans = plan_all<Float, FFT_C2R>(resol_id, kfield, loens, nfft, offsets);
for (auto &plan : plans)
plan.exec(data_real, data_complex);
}
void execute_dir_fft_double(double *data_real,
fftw_complex *data_complex, int resol_id, int kfield,
int *loens, int64_t *offsets, int nfft,
void *growing_allocator) {
auto plans = plan_all<Double, FFT_D2Z>(resol_id, kfield, loens, nfft, offsets);
for (auto &plan : plans)
plan.exec(data_real, data_complex);
}
void execute_inv_fft_double(fftw_complex *data_complex,
double *data_real, int resol_id, int kfield, int *loens,
int64_t *offsets, int nfft, void *growing_allocator) {
auto plans = plan_all<Double, FFT_Z2D>(resol_id, kfield, loens, nfft, offsets);
for (auto &plan : plans)
plan.exec(data_real, data_complex);
}

void clean_fft(int resol_id) {
erase_from_caches<Float, FFT_R2C>(resol_id);
erase_from_caches<Float, FFT_C2R>(resol_id);
erase_from_caches<Double, FFT_D2Z>(resol_id);
erase_from_caches<Double, FFT_Z2D>(resol_id);
}
}
Loading
Loading