marsdeno · lukasm91 · Jun 5, 2024 · Jun 10, 2024 · Jun 10, 2024 · Jun 10, 2024
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -13,6 +13,8 @@ cmake_minimum_required( VERSION 3.18 FATAL_ERROR )
 
 find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild )
 
+set(CMAKE_CUDA_RUNTIME_LIBRARY Shared)
+
 project( ectrans LANGUAGES C CXX Fortran )
 include( ectrans_macros )
 

diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt
@@ -59,6 +59,7 @@ foreach( program ectrans-benchmark ectrans-benchmark-ifs )
                                       trans_gpu_${prec}
                                       OpenACC::OpenACC_Fortran
             )
+            target_compile_definitions(${program}-gpu-${prec} PUBLIC USE_GPU)
        endif()
     endforeach( prec )
   endif( HAVE_GPU )

diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90
@@ -9,6 +9,12 @@
 
 program transform_test
 
+#ifdef USE_GPU
+#define PINNED_TAG , pinned
+#else
+#define PINNED_TAG
+#endif
+
 !
 ! Spectral transform test
 !
@@ -94,18 +100,18 @@ program transform_test
 real(kind=jprd) :: zaveave(0:jpmaxstat)
 
 ! Grid-point space data structures
-real(kind=jprb), allocatable, target :: zgmv   (:,:,:,:) ! Multilevel fields at t and t-dt
-real(kind=jprb), allocatable, target :: zgmvs  (:,:,:)   ! Single level fields at t and t-dt
+real(kind=jprb), allocatable, target PINNED_TAG :: zgmv   (:,:,:,:) ! Multilevel fields at t and t-dt
+real(kind=jprb), allocatable, target PINNED_TAG :: zgmvs  (:,:,:)   ! Single level fields at t and t-dt
 real(kind=jprb), pointer :: zgp3a (:,:,:,:) ! Multilevel fields at t and t-dt
 real(kind=jprb), pointer :: zgpuv   (:,:,:,:) ! Multilevel fields at t and t-dt
 real(kind=jprb), pointer :: zgp2 (:,:,:) ! Single level fields at t and t-dt
 
 ! Spectral space data structures
-real(kind=jprb), allocatable, target :: sp3d(:,:,:)
+real(kind=jprb), allocatable, target PINNED_TAG :: sp3d(:,:,:)
 real(kind=jprb), pointer :: zspvor(:,:) => null()
 real(kind=jprb), pointer :: zspdiv(:,:) => null()
 real(kind=jprb), pointer :: zspsc3a(:,:,:) => null()
-real(kind=jprb), allocatable :: zspsc2(:,:)
+real(kind=jprb), allocatable PINNED_TAG :: zspsc2(:,:)
 
 logical :: lstack = .false. ! Output stack info
 logical :: luserpnm = .false.

diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt
@@ -119,6 +119,8 @@ foreach( prec dp sp )
                 $<$<LINK_LANGUAGE:Fortran>:SHELL:${OpenACC_Fortran_FLAGS}>
                 $<$<LINK_LANG_AND_ID:C,NVHPC>:SHELL:${OpenACC_Fortran_FLAGS}>
                 $<$<LINK_LANG_AND_ID:CXX,NVHPC>:SHELL:${OpenACC_Fortran_FLAGS}> )
+           target_compile_options(trans_gpu_${prec} PUBLIC $<$<COMPILE_LANG_AND_ID:Fortran,NVHPC>:-cuda -gpu=nordc>)
+           target_link_options(trans_gpu_${prec} PUBLIC -cuda)
         endif()
 
   endif()

diff --git a/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp b/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp
@@ -43,24 +43,21 @@ hicfft_create_plan_(hipfftHandle * *plan, int *ISIGNp, int *Np, int *LOTp, int *
     // Disable auto allocation
     fftSafeCall(hipfftSetAutoAllocation(**plan, false));
 
+    size_t thisWorkplanSize;
     if( ISIGN== -1 ){
-      fftSafeCall(hipfftPlanMany(*plan, 1, &N,
+      fftSafeCall(hipfftMakePlanMany(**plan, 1, &N,
                     embed, stride, dist,
                     embed, stride, dist,
-                    fft_dir, LOT));
+                    fft_dir, LOT, &thisWorkplanSize));
     }  else if( ISIGN== 1){
-      fftSafeCall(hipfftPlanMany(*plan, 1, &N,
+      fftSafeCall(hipfftMakePlanMany(**plan, 1, &N,
                     embed, stride, dist,
                     embed, stride, dist,
-                    fft_inv, LOT));
+                    fft_inv, LOT, &thisWorkplanSize));
     } else {
       abort();
     }
 
-    // get size used by this plan
-    size_t thisWorkplanSize;
-    hipfftGetSize(**plan, &thisWorkplanSize);
-
     // check if this the work space is sufficiently large
     if (thisWorkplanSize > currentWorkspaceSize) {
       hipDeviceSynchronize();

diff --git a/src/trans/gpu/algor/external/fourier/hicfft_cuda.h b/src/trans/gpu/algor/external/fourier/hicfft_cuda.h
@@ -43,6 +43,7 @@
 #define hipfftCreate cufftCreate
 #define hipfftDestroy cufftDestroy
 #define hipfftPlanMany cufftPlanMany
+#define hipfftMakePlanMany cufftMakePlanMany
 #define hipfftGetSize cufftGetSize
 #define hipfftSetAutoAllocation cufftSetAutoAllocation 
 #define hipfftSetStream cufftSetStream

diff --git a/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h
@@ -1,3 +1,6 @@
+// (C) Copyright 2000- ECMWF.
+// (C) Copyright 2024- NVIDIA.
+
 #ifdef USE_CUTLASS
 //#include "hicblas.h"
 #include "cutlass/gemm/device/gemm.h"
@@ -147,9 +150,9 @@ class cutlass_sgemm_grouped<CutlassType::cutlass_fp32, TransA, TransB> {
 template <cublasOperation_t TransA, cublasOperation_t TransB>
 void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
                                       float alpha, const float *A, int lda,
-                                      int *offsetsA, const float *B, int ldb,
-                                      int *offsetsB, float beta, float *C,
-                                      int ldc, int *offsetsC, int batchCount,
+                                      int64_t *offsetsA, const float *B, int *ldb,
+                                      int64_t *offsetsB, float beta, float *C,
+                                      int ldc, int64_t *offsetsC, int batchCount,
                                       cudaStream_t stream,
                                       void *growing_allocator) {
   using namespace detail;
@@ -174,9 +177,9 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k,
 
 void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
                                    int m, int *n, int *k, float alpha,
-                                   const float *A, int lda, int *offsetsA,
-                                   const float *B, int ldb, int *offsetsB, float beta,
-                                   float *C, int ldc, int *offsetsC,
+                                   const float *A, int lda, int64_t *offsetsA,
+                                   const float *B, int *ldb, int64_t *offsetsB, float beta,
+                                   float *C, int ldc, int64_t *offsetsC,
                                    int batchCount, cudaStream_t stream,
                                    void *growing_allocator) {
 

diff --git a/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp
@@ -1,4 +1,5 @@
 // (C) Copyright 2000- ECMWF.
+// (C) Copyright 2024- NVIDIA.
 //
 // This software is licensed under the terms of the Apache Licence Version 2.0
 // which can be obtained at http://www.apache.org/licenses/LICENSE-2.0.
@@ -63,11 +64,12 @@ template <typename Gemm, typename Real> void free_gemm_cache(float *, size_t) {
 
 // this version is using graphs and caches the graphs
 template <typename Gemm, typename Real>
-void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
-                     const Real *A, int lda, int *offsetsA, const Real *B,
-                     int ldb, int *offsetsB, Real beta, Real *C, int ldc,
-                     int *offsetsC, int batchCount, hipStream_t stream,
-                     int blas_id, void *growing_allocator) {
+void run_group_graph(Gemm &&gemm, int const m, int const *n, int const *k,
+                     Real alpha, const Real *A, int lda, int64_t const *offsetsA,
+                     const Real *B, int const *ldb, int64_t const *offsetsB, Real beta,
+                     Real *C, int ldc, int64_t const *offsetsC, int batchCount,
+                     hipStream_t stream, int blas_id,
+                     void *growing_allocator) {
   growing_allocator_register_free_c(growing_allocator,
                                     free_gemm_cache<Gemm, Real>);
 
@@ -86,7 +88,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
     // the plan is cached, but the pointers are not correct. we remove and
     // delete the graph, but we keep the hipblas handles, if this happens more
     // often, we should cache this...
-    std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." << std::endl;
+    std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n";
     std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id
               << "}\n";
     std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", "
@@ -111,7 +113,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
 
       HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal));
       gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
-           ldb, beta, C + offsetsC[i], ldc);
+           ldb[i], beta, C + offsetsC[i], ldc);
       hipGraph_t my_graph;
       HIC_CHECK(hipStreamEndCapture(stream, &my_graph));
       hipGraphNode_t my_node;
@@ -133,14 +135,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha,
 // stupid simple gemm calls
 template <typename Gemm, typename Real>
 void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A,
-               int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB,
-               Real beta, Real *C, int ldc, int *offsetsC, int batchCount,
+               int lda, int64_t *offsetsA, const Real *B, int *ldb, int64_t *offsetsB,
+               Real beta, Real *C, int ldc, int64_t *offsetsC, int batchCount,
                hipStream_t stream, int = -1) {
   for (int i = 0; i < batchCount; ++i) {
     if (m == 0 || n[i] == 0 || k[i] == 0)
       continue;
-    gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb,
-         beta, C + offsetsC[i], ldc);
+    gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i],
+         ldb[i], beta, C + offsetsC[i], ldc);
   }
 }
 
@@ -184,9 +186,9 @@ template <typename Real> struct hipblas_gemm_grouped {
 
 void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
                                    int m, int *n, int *k, float alpha,
-                                   const float *A, int lda, int *offsetsA,
-                                   const float *B, int ldb, int *offsetsB, float beta,
-                                   float *C, int ldc, int *offsetsC,
+                                   const float *A, int lda, int64_t *offsetsA,
+                                   const float *B, int *ldb, int64_t *offsetsB, float beta,
+                                   float *C, int ldc, int64_t *offsetsC,
                                    int batchCount, hipStream_t stream,
                                   void *growing_allocator) {
 
@@ -211,10 +213,10 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
 void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
                                    int m, int *n, int *k,
                                    double alpha,
-                                   const double *A, int lda, int *offsetsA,
-                                   const double *B, int ldb, int *offsetsB,
+                                   const double *A, int lda, int64_t *offsetsA,
+                                   const double *B, int *ldb, int64_t *offsetsB,
                                    double beta,
-                                   double *C, int ldc, int *offsetsC,
+                                   double *C, int ldc, int64_t *offsetsC,
                                    int batchCount, hipStream_t stream, void *) {
 
   hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N;
@@ -287,9 +289,9 @@ void hipblas_sgemm_wrapper (char transa, char transb,
 
 void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
                                 int m, int *n, int *k, float alpha,
-                                const float *A, int lda, int *offsetsA,
-                                const float *B, int ldb, int *offsetsB, float beta,
-                                float *C, int ldc, int *offsetsC,
+                                const float *A, int lda, int64_t *offsetsA,
+                                const float *B, int *ldb, int64_t *offsetsB, float beta,
+                                float *C, int ldc, int64_t *offsetsC,
                                 int batchCount, size_t stream,
                                 void *growing_allocator) {
 #ifdef USE_CUTLASS
@@ -308,9 +310,9 @@ void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb,
 
 void blas_dgemm_wrapper_grouped(int blas_id, char transa, char transb,
                                 int m, int *n, int *k, double alpha,
-                                const double *A, int lda, int *offsetsA,
-                                const double *B, int ldb, int *offsetsB, double beta,
-                                double *C, int ldc, int *offsetsC,
+                                const double *A, int lda, int64_t *offsetsA,
+                                const double *B, int *ldb, int64_t *offsetsB, double beta,
+                                double *C, int ldc, int64_t *offsetsC,
                                 int batchCount, size_t stream,
                                 void *growing_allocator) {
     hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B,