Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ cmake_minimum_required( VERSION 3.18 FATAL_ERROR )

find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild )

set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)

project( ectrans LANGUAGES C CXX Fortran )
include( ectrans_macros )

Expand Down
1 change: 1 addition & 0 deletions src/programs/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,7 @@ foreach( program ectrans-benchmark ectrans-benchmark-ifs )
trans_gpu_${prec}
OpenACC::OpenACC_Fortran
)
target_compile_definitions(${program}-gpu-${prec} PUBLIC USE_GPU)
endif()
endforeach( prec )
endif( HAVE_GPU )
Expand Down
14 changes: 10 additions & 4 deletions src/programs/ectrans-benchmark.F90
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,12 @@

program transform_test

#ifdef USE_GPU
#define PINNED_TAG , pinned
#else
#define PINNED_TAG
#endif

!
! Spectral transform test
!
Expand Down Expand Up @@ -94,18 +100,18 @@ program transform_test
real(kind=jprd) :: zaveave(0:jpmaxstat)

! Grid-point space data structures
real(kind=jprb), allocatable, target :: zgmv (:,:,:,:) ! Multilevel fields at t and t-dt
real(kind=jprb), allocatable, target :: zgmvs (:,:,:) ! Single level fields at t and t-dt
real(kind=jprb), allocatable, target PINNED_TAG :: zgmv (:,:,:,:) ! Multilevel fields at t and t-dt
real(kind=jprb), allocatable, target PINNED_TAG :: zgmvs (:,:,:) ! Single level fields at t and t-dt
real(kind=jprb), pointer :: zgp3a (:,:,:,:) ! Multilevel fields at t and t-dt
real(kind=jprb), pointer :: zgpuv (:,:,:,:) ! Multilevel fields at t and t-dt
real(kind=jprb), pointer :: zgp2 (:,:,:) ! Single level fields at t and t-dt

! Spectral space data structures
real(kind=jprb), allocatable, target :: sp3d(:,:,:)
real(kind=jprb), allocatable, target PINNED_TAG :: sp3d(:,:,:)
real(kind=jprb), pointer :: zspvor(:,:) => null()
real(kind=jprb), pointer :: zspdiv(:,:) => null()
real(kind=jprb), pointer :: zspsc3a(:,:,:) => null()
real(kind=jprb), allocatable :: zspsc2(:,:)
real(kind=jprb), allocatable PINNED_TAG :: zspsc2(:,:)

logical :: lstack = .false. ! Output stack info
logical :: luserpnm = .false.
Expand Down
2 changes: 2 additions & 0 deletions src/trans/gpu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -119,6 +119,8 @@ foreach( prec dp sp )
$<$<LINK_LANGUAGE:Fortran>:SHELL:${OpenACC_Fortran_FLAGS}>
$<$<LINK_LANG_AND_ID:C,NVHPC>:SHELL:${OpenACC_Fortran_FLAGS}>
$<$<LINK_LANG_AND_ID:CXX,NVHPC>:SHELL:${OpenACC_Fortran_FLAGS}> )
target_compile_options(trans_gpu_${prec} PUBLIC $<$<COMPILE_LANG_AND_ID:Fortran,NVHPC>:-cuda -gpu=nordc>)
target_link_options(trans_gpu_${prec} PUBLIC -cuda)
endif()

endif()
Expand Down
13 changes: 5 additions & 8 deletions src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,24 +43,21 @@ hicfft_create_plan_(hipfftHandle * *plan, int *ISIGNp, int *Np, int *LOTp, int *
// Disable auto allocation
fftSafeCall(hipfftSetAutoAllocation(**plan, false));

size_t thisWorkplanSize;
if( ISIGN== -1 ){
fftSafeCall(hipfftPlanMany(*plan, 1, &N,
fftSafeCall(hipfftMakePlanMany(**plan, 1, &N,
embed, stride, dist,
embed, stride, dist,
fft_dir, LOT));
fft_dir, LOT, &thisWorkplanSize));
} else if( ISIGN== 1){
fftSafeCall(hipfftPlanMany(*plan, 1, &N,
fftSafeCall(hipfftMakePlanMany(**plan, 1, &N,
embed, stride, dist,
embed, stride, dist,
fft_inv, LOT));
fft_inv, LOT, &thisWorkplanSize));
} else {
abort();
}

// get size used by this plan
size_t thisWorkplanSize;
hipfftGetSize(**plan, &thisWorkplanSize);

// check if this the work space is sufficiently large
if (thisWorkplanSize > currentWorkspaceSize) {
hipDeviceSynchronize();
Expand Down
1 change: 1 addition & 0 deletions src/trans/gpu/algor/external/fourier/hicfft_cuda.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@
#define hipfftCreate cufftCreate
#define hipfftDestroy cufftDestroy
#define hipfftPlanMany cufftPlanMany
#define hipfftMakePlanMany cufftMakePlanMany
#define hipfftGetSize cufftGetSize
#define hipfftSetAutoAllocation cufftSetAutoAllocation
#define hipfftSetStream cufftSetStream
Expand Down
15 changes: 9 additions & 6 deletions src/trans/gpu/algor/module/hicblas_cutlass.cuda.h
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
// (C) Copyright 2000- ECMWF.
// (C) Copyright 2024- NVIDIA.

#ifdef USE_CUTLASS
//#include "hicblas.h"
#include "cutlass/gemm/device/gemm.h"
Expand Down Expand Up @@ -147,9 +150,9 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
template <cublasOperation_t TransA, cublasOperation_t TransB>
void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
float alpha, const float *A, int lda,
int *offsetsA, const float *B, int ldb,
int *offsetsB, float beta, float *C,
int ldc, int *offsetsC, int batchCount,
int64_t *offsetsA, const float *B, int *ldb,
int64_t *offsetsB, float beta, float *C,
int ldc, int64_t *offsetsC, int batchCount,
cudaStream_t stream,
void *growing_allocator) {
using namespace detail;
Expand All @@ -174,9 +177,9 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,

void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
const float *A, int lda, int64_t *offsetsA,
const float *B, int *ldb, int64_t *offsetsB, float beta,
float *C, int ldc, int64_t *offsetsC,
int batchCount, cudaStream_t stream,
void *growing_allocator) {

Expand Down
48 changes: 25 additions & 23 deletions src/trans/gpu/algor/module/hicblas_gemm.hip.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
// (C) Copyright 2000- ECMWF.
// (C) Copyright 2024- NVIDIA.
//
// This software is licensed under the terms of the Apache Licence Version 2.0
// which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
Expand Down Expand Up @@ -63,11 +64,12 @@ template <typename Gemm, typename Real> void free_gemm_cache(float *, size_t) {

// this version is using graphs and caches the graphs
template <typename Gemm, typename Real>
void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
const Real *A, int lda, int *offsetsA, const Real *B,
int ldb, int *offsetsB, Real beta, Real *C, int ldc,
int *offsetsC, int batchCount, hipStream_t stream,
int blas_id, void *growing_allocator) {
void run_group_graph(Gemm &&gemm, int const m, int const *n, int const *k,
Real alpha, const Real *A, int lda, int64_t const *offsetsA,
const Real *B, int const *ldb, int64_t const *offsetsB, Real beta,
Real *C, int ldc, int64_t const *offsetsC, int batchCount,
hipStream_t stream, int blas_id,
void *growing_allocator) {
growing_allocator_register_free_c(growing_allocator,
free_gemm_cache<Gemm, Real>);

Expand All @@ -86,7 +88,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
// the plan is cached, but the pointers are not correct. we remove and
// delete the graph, but we keep the hipblas handles, if this happens more
// often, we should cache this...
std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl;
std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n";
std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id
<< "}\n";
std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", "
Expand All @@ -111,7 +113,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,

HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb, beta, C + offsetsC[i], ldc);
ldb[i], beta, C + offsetsC[i], ldc);
hipGraph_t my_graph;
HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
hipGraphNode_t my_node;
Expand All @@ -133,14 +135,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
// stupid simple gemm calls
template <typename Gemm, typename Real>
void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A,
int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB,
Real beta, Real *C, int ldc, int *offsetsC, int batchCount,
int lda, int64_t *offsetsA, const Real *B, int *ldb, int64_t *offsetsB,
Real beta, Real *C, int ldc, int64_t *offsetsC, int batchCount,
hipStream_t stream, int = -1) {
for (int i = 0; i < batchCount; ++i) {
if (m == 0 || n[i] == 0 || k[i] == 0)
continue;
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb,
beta, C + offsetsC[i], ldc);
gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
ldb[i], beta, C + offsetsC[i], ldc);
}
}

Expand Down Expand Up @@ -184,9 +186,9 @@ template <typename Real> struct hipblas_gemm_grouped {

void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
const float *A, int lda, int64_t *offsetsA,
const float *B, int *ldb, int64_t *offsetsB, float beta,
float *C, int ldc, int64_t *offsetsC,
int batchCount, hipStream_t stream,
void *growing_allocator) {

Expand All @@ -211,10 +213,10 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k,
double alpha,
const double *A, int lda, int *offsetsA,
const double *B, int ldb, int *offsetsB,
const double *A, int lda, int64_t *offsetsA,
const double *B, int *ldb, int64_t *offsetsB,
double beta,
double *C, int ldc, int *offsetsC,
double *C, int ldc, int64_t *offsetsC,
int batchCount, hipStream_t stream, void *) {

hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N;
Expand Down Expand Up @@ -287,9 +289,9 @@ void hipblas_sgemm_wrapper (char transa, char transb,

void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, float alpha,
const float *A, int lda, int *offsetsA,
const float *B, int ldb, int *offsetsB, float beta,
float *C, int ldc, int *offsetsC,
const float *A, int lda, int64_t *offsetsA,
const float *B, int *ldb, int64_t *offsetsB, float beta,
float *C, int ldc, int64_t *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
#ifdef USE_CUTLASS
Expand All @@ -308,9 +310,9 @@ void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,

void blas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
int m, int *n, int *k, double alpha,
const double *A, int lda, int *offsetsA,
const double *B, int ldb, int *offsetsB, double beta,
double *C, int ldc, int *offsetsC,
const double *A, int lda, int64_t *offsetsA,
const double *B, int *ldb, int64_t *offsetsB, double beta,
double *C, int ldc, int64_t *offsetsC,
int batchCount, size_t stream,
void *growing_allocator) {
hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B,
Expand Down
Loading