diff --git a/CMakeLists.txt b/CMakeLists.txt index 48154ac33..759db7db8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,6 +13,8 @@ cmake_minimum_required( VERSION 3.18 FATAL_ERROR ) find_package( ecbuild 3.4 REQUIRED HINTS ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../ecbuild ) +set(CMAKE_CUDA_RUNTIME_LIBRARY Shared) + project( ectrans LANGUAGES C CXX Fortran ) include( ectrans_macros ) diff --git a/src/programs/CMakeLists.txt b/src/programs/CMakeLists.txt index ec566cafd..dc040a86e 100644 --- a/src/programs/CMakeLists.txt +++ b/src/programs/CMakeLists.txt @@ -59,6 +59,7 @@ foreach( program ectrans-benchmark ectrans-benchmark-ifs ) trans_gpu_${prec} OpenACC::OpenACC_Fortran ) + target_compile_definitions(${program}-gpu-${prec} PUBLIC USE_GPU) endif() endforeach( prec ) endif( HAVE_GPU ) diff --git a/src/programs/ectrans-benchmark.F90 b/src/programs/ectrans-benchmark.F90 index 3ba41d174..e9028507e 100644 --- a/src/programs/ectrans-benchmark.F90 +++ b/src/programs/ectrans-benchmark.F90 @@ -9,6 +9,12 @@ program transform_test +#ifdef USE_GPU +#define PINNED_TAG , pinned +#else +#define PINNED_TAG +#endif + ! ! Spectral transform test ! @@ -94,18 +100,18 @@ program transform_test real(kind=jprd) :: zaveave(0:jpmaxstat) ! Grid-point space data structures -real(kind=jprb), allocatable, target :: zgmv (:,:,:,:) ! Multilevel fields at t and t-dt -real(kind=jprb), allocatable, target :: zgmvs (:,:,:) ! Single level fields at t and t-dt +real(kind=jprb), allocatable, target PINNED_TAG :: zgmv (:,:,:,:) ! Multilevel fields at t and t-dt +real(kind=jprb), allocatable, target PINNED_TAG :: zgmvs (:,:,:) ! Single level fields at t and t-dt real(kind=jprb), pointer :: zgp3a (:,:,:,:) ! Multilevel fields at t and t-dt real(kind=jprb), pointer :: zgpuv (:,:,:,:) ! Multilevel fields at t and t-dt real(kind=jprb), pointer :: zgp2 (:,:,:) ! Single level fields at t and t-dt ! 
Spectral space data structures -real(kind=jprb), allocatable, target :: sp3d(:,:,:) +real(kind=jprb), allocatable, target PINNED_TAG :: sp3d(:,:,:) real(kind=jprb), pointer :: zspvor(:,:) => null() real(kind=jprb), pointer :: zspdiv(:,:) => null() real(kind=jprb), pointer :: zspsc3a(:,:,:) => null() -real(kind=jprb), allocatable :: zspsc2(:,:) +real(kind=jprb), allocatable PINNED_TAG :: zspsc2(:,:) logical :: lstack = .false. ! Output stack info logical :: luserpnm = .false. diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index b0886d265..517b79ac5 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -119,6 +119,8 @@ foreach( prec dp sp ) $<$:SHELL:${OpenACC_Fortran_FLAGS}> $<$:SHELL:${OpenACC_Fortran_FLAGS}> $<$:SHELL:${OpenACC_Fortran_FLAGS}> ) + target_compile_options(trans_gpu_${prec} PUBLIC $<$:-cuda -gpu=nordc>) + target_link_options(trans_gpu_${prec} PUBLIC -cuda) endif() endif() diff --git a/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp b/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp index a1aa074ce..20877f235 100644 --- a/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp +++ b/src/trans/gpu/algor/external/fourier/hicfft_create_plan.hip.cpp @@ -43,24 +43,21 @@ hicfft_create_plan_(hipfftHandle * *plan, int *ISIGNp, int *Np, int *LOTp, int * // Disable auto allocation fftSafeCall(hipfftSetAutoAllocation(**plan, false)); + size_t thisWorkplanSize; if( ISIGN== -1 ){ - fftSafeCall(hipfftPlanMany(*plan, 1, &N, + fftSafeCall(hipfftMakePlanMany(**plan, 1, &N, embed, stride, dist, embed, stride, dist, - fft_dir, LOT)); + fft_dir, LOT, &thisWorkplanSize)); } else if( ISIGN== 1){ - fftSafeCall(hipfftPlanMany(*plan, 1, &N, + fftSafeCall(hipfftMakePlanMany(**plan, 1, &N, embed, stride, dist, embed, stride, dist, - fft_inv, LOT)); + fft_inv, LOT, &thisWorkplanSize)); } else { abort(); } - // get size used by this plan - size_t thisWorkplanSize; - hipfftGetSize(**plan, 
&thisWorkplanSize); - // check if this the work space is sufficiently large if (thisWorkplanSize > currentWorkspaceSize) { hipDeviceSynchronize(); diff --git a/src/trans/gpu/algor/external/fourier/hicfft_cuda.h b/src/trans/gpu/algor/external/fourier/hicfft_cuda.h index 89745331d..a23895092 100644 --- a/src/trans/gpu/algor/external/fourier/hicfft_cuda.h +++ b/src/trans/gpu/algor/external/fourier/hicfft_cuda.h @@ -43,6 +43,7 @@ #define hipfftCreate cufftCreate #define hipfftDestroy cufftDestroy #define hipfftPlanMany cufftPlanMany +#define hipfftMakePlanMany cufftMakePlanMany #define hipfftGetSize cufftGetSize #define hipfftSetAutoAllocation cufftSetAutoAllocation #define hipfftSetStream cufftSetStream diff --git a/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h b/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h index a05ff5754..f660bef99 100644 --- a/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h +++ b/src/trans/gpu/algor/module/hicblas_cutlass.cuda.h @@ -1,3 +1,6 @@ +// (C) Copyright 2000- ECMWF. +// (C) Copyright 2024- NVIDIA. 
+ #ifdef USE_CUTLASS //#include "hicblas.h" #include "cutlass/gemm/device/gemm.h" @@ -147,9 +150,9 @@ class cutlass_sgemm_grouped { template void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, float alpha, const float *A, int lda, - int *offsetsA, const float *B, int ldb, - int *offsetsB, float beta, float *C, - int ldc, int *offsetsC, int batchCount, + int64_t *offsetsA, const float *B, int *ldb, + int64_t *offsetsB, float beta, float *C, + int ldc, int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { using namespace detail; @@ -174,9 +177,9 @@ void cutlass_sgemm_wrapper_grouped_op(int blas_id, int m, int *n, int *k, void cutlass_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int *ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, cudaStream_t stream, void *growing_allocator) { diff --git a/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp b/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp index 705230fcd..27ea2a12f 100644 --- a/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp +++ b/src/trans/gpu/algor/module/hicblas_gemm.hip.cpp @@ -1,4 +1,5 @@ // (C) Copyright 2000- ECMWF. +// (C) Copyright 2024- NVIDIA. // // This software is licensed under the terms of the Apache Licence Version 2.0 // which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. 
@@ -63,11 +64,12 @@ template void free_gemm_cache(float *, size_t) { // this version is using graphs and caches the graphs template -void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, - const Real *A, int lda, int *offsetsA, const Real *B, - int ldb, int *offsetsB, Real beta, Real *C, int ldc, - int *offsetsC, int batchCount, hipStream_t stream, - int blas_id, void *growing_allocator) { +void run_group_graph(Gemm &&gemm, int const m, int const *n, int const *k, + Real alpha, const Real *A, int lda, int64_t const *offsetsA, + const Real *B, int const *ldb, int64_t const *offsetsB, Real beta, + Real *C, int ldc, int64_t const *offsetsC, int batchCount, + hipStream_t stream, int blas_id, + void *growing_allocator) { growing_allocator_register_free_c(growing_allocator, free_gemm_cache); @@ -86,7 +88,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // the plan is cached, but the pointers are not correct. we remove and // delete the graph, but we keep the hipblas handles, if this happens more // often, we should cache this... - std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow." 
<< std::endl; + std::cout << "WARNING GEMM: POINTER CHANGE - Graph recreation might be slow.\n"; std::cout << "We have an entry with key {m=" << m << ", blas_id=" << blas_id << "}\n"; std::cout << "Pointers: " << std::get<0>(ptrs->second) << ", " @@ -111,7 +113,7 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, HIC_CHECK(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal)); gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], - ldb, beta, C + offsetsC[i], ldc); + ldb[i], beta, C + offsetsC[i], ldc); hipGraph_t my_graph; HIC_CHECK(hipStreamEndCapture(stream, &my_graph)); hipGraphNode_t my_node; @@ -133,14 +135,14 @@ void run_group_graph(Gemm &&gemm, int m, int *n, int *k, Real alpha, // stupid simple gemm calls template void run_group(Gemm &&gemm, int m, int *n, int *k, Real alpha, const Real *A, - int lda, int *offsetsA, const Real *B, int ldb, int *offsetsB, - Real beta, Real *C, int ldc, int *offsetsC, int batchCount, + int lda, int64_t *offsetsA, const Real *B, int *ldb, int64_t *offsetsB, + Real beta, Real *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, int = -1) { for (int i = 0; i < batchCount; ++i) { if (m == 0 || n[i] == 0 || k[i] == 0) continue; - gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], ldb, - beta, C + offsetsC[i], ldc); + gemm(stream, m, n[i], k[i], alpha, A + offsetsA[i], lda, B + offsetsB[i], + ldb[i], beta, C + offsetsC[i], ldc); } } @@ -184,9 +186,9 @@ template struct hipblas_gemm_grouped { void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int *ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, void *growing_allocator) { @@ -211,10 
+213,10 @@ void hipblas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, void hipblas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, + const double *A, int lda, int64_t *offsetsA, + const double *B, int *ldb, int64_t *offsetsB, double beta, - double *C, int ldc, int *offsetsC, + double *C, int ldc, int64_t *offsetsC, int batchCount, hipStream_t stream, void *) { hipblasOperation_t op_t1=HIPBLAS_OP_N, op_t2=HIPBLAS_OP_N; @@ -287,9 +289,9 @@ void hipblas_sgemm_wrapper (char transa, char transb, void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, float alpha, - const float *A, int lda, int *offsetsA, - const float *B, int ldb, int *offsetsB, float beta, - float *C, int ldc, int *offsetsC, + const float *A, int lda, int64_t *offsetsA, + const float *B, int *ldb, int64_t *offsetsB, float beta, + float *C, int ldc, int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { #ifdef USE_CUTLASS @@ -308,9 +310,9 @@ void blas_sgemm_wrapper_grouped(int blas_id, char transa, char transb, void blas_dgemm_wrapper_grouped(int blas_id, char transa, char transb, int m, int *n, int *k, double alpha, - const double *A, int lda, int *offsetsA, - const double *B, int ldb, int *offsetsB, double beta, - double *C, int ldc, int *offsetsC, + const double *A, int lda, int64_t *offsetsA, + const double *B, int *ldb, int64_t *offsetsB, double beta, + double *C, int ldc, int64_t *offsetsC, int batchCount, size_t stream, void *growing_allocator) { hipblas_dgemm_wrapper_grouped(blas_id, transa, transb, m, n, k, alpha, A, lda, offsetsA, B, diff --git a/src/trans/gpu/algor/module/hicblas_mod.F90 b/src/trans/gpu/algor/module/hicblas_mod.F90 index d6e2a5a35..778b0a728 100644 --- a/src/trans/gpu/algor/module/hicblas_mod.F90 +++ b/src/trans/gpu/algor/module/hicblas_mod.F90 @@ -16,7 +16,7 @@ MODULE 
HICBLAS_MOD -USE PARKIND1, ONLY : JPIM, JPRM, JPRD +USE PARKIND1, ONLY : JPIM, JPRM, JPRD, JPIB USE GROWING_ALLOCATOR_MOD, ONLY: GROWING_ALLOCATION_TYPE USE ISO_C_BINDING USE OPENACC_LIB, ONLY: ACC_GET_HIP_STREAM @@ -82,9 +82,9 @@ SUBROUTINE HIP_DGEMM_BATCHED( & CHARACTER(1,C_CHAR), VALUE :: CTA, CTB INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT REAL(C_DOUBLE), VALUE :: ALPHA,BETA - REAL(C_DOUBLE), DIMENSION(LDA,*) :: A - REAL(C_DOUBLE), DIMENSION(LDB,*) :: B - REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + REAL(C_DOUBLE), DIMENSION(LDA,*), DEVICE :: A + REAL(C_DOUBLE), DIMENSION(LDB,*), DEVICE :: B + REAL(C_DOUBLE), DIMENSION(LDC,*), DEVICE :: C INTEGER(KIND=C_SIZE_T) :: STREAM TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_DGEMM_BATCHED @@ -106,9 +106,9 @@ SUBROUTINE HIP_DGEMM_STRIDED_BATCHED(& INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, BATCHCOUNT INTEGER(C_INT), VALUE :: TDA,TDB,TDC REAL(C_DOUBLE), VALUE :: ALPHA, BETA - REAL(C_DOUBLE), DIMENSION(LDA,*) :: A - REAL(C_DOUBLE), DIMENSION(LDB,*) :: B - REAL(C_DOUBLE), DIMENSION(LDC,*) :: C + REAL(C_DOUBLE), DIMENSION(LDA,*), DEVICE :: A + REAL(C_DOUBLE), DIMENSION(LDB,*), DEVICE :: B + REAL(C_DOUBLE), DIMENSION(LDC,*), DEVICE :: C INTEGER(KIND=C_SIZE_T) :: STREAM END SUBROUTINE HIP_DGEMM_STRIDED_BATCHED END INTERFACE @@ -133,9 +133,9 @@ SUBROUTINE HIP_SGEMM_BATCHED( & CHARACTER(1,C_CHAR), VALUE :: CTA, CTB INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, LDC, TDA, TDB, TDC, BATCHCOUNT REAL(C_FLOAT), VALUE :: ALPHA, BETA - REAL(C_FLOAT), DIMENSION(LDA,*) :: A - REAL(C_FLOAT), DIMENSION(LDB,*) :: B - REAL(C_FLOAT), DIMENSION(LDC,*) :: C + REAL(C_FLOAT), DIMENSION(LDA,*), DEVICE :: A + REAL(C_FLOAT), DIMENSION(LDB,*), DEVICE :: B + REAL(C_FLOAT), DIMENSION(LDC,*), DEVICE :: C INTEGER(KIND=C_SIZE_T) :: STREAM TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_SGEMM_BATCHED @@ -157,9 +157,9 @@ SUBROUTINE HIP_SGEMM_STRIDED_BATCHED(& INTEGER(C_INT), VALUE :: M, N, K, LDA, LDB, 
LDC, BATCHCOUNT INTEGER(C_INT), VALUE :: TDA,TDB,TDC REAL(C_FLOAT), VALUE :: ALPHA, BETA - REAL(C_FLOAT), DIMENSION(LDA,*) :: A - REAL(C_FLOAT), DIMENSION(LDB,*) :: B - REAL(C_FLOAT), DIMENSION(LDC,*) :: C + REAL(C_FLOAT), DIMENSION(LDA,*), DEVICE :: A + REAL(C_FLOAT), DIMENSION(LDB,*), DEVICE :: B + REAL(C_FLOAT), DIMENSION(LDC,*), DEVICE :: C INTEGER(KIND=C_SIZE_T) :: STREAM END SUBROUTINE HIP_SGEMM_STRIDED_BATCHED END INTERFACE @@ -183,9 +183,10 @@ SUBROUTINE HIP_DGEMM_GROUPED( & USE ISO_C_BINDING CHARACTER(1,C_CHAR), VALUE :: CTA, CTB INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT) :: N(*), K(*) + INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_DOUBLE), VALUE :: ALPHA,BETA - REAL(C_DOUBLE) :: A(*), B(*), C(*) + REAL(C_DOUBLE), DEVICE :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_DGEMM_GROUPED @@ -201,10 +202,11 @@ SUBROUTINE HIP_SGEMM_GROUPED( & &) BIND(C, NAME='blas_sgemm_wrapper_grouped') USE ISO_C_BINDING CHARACTER(1,C_CHAR), VALUE :: CTA, CTB - INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDB, LDC, BATCHCOUNT - INTEGER(C_INT) :: N(*), K(*), OFFSETA(*), OFFSETB(*), OFFSETC(*) + INTEGER(C_INT), VALUE :: BLAS_ID, M, LDA, LDC, BATCHCOUNT + INTEGER(C_INT) :: N(*), K(*), LDB(*) + INTEGER(C_INT64_T) :: OFFSETA(*), OFFSETB(*), OFFSETC(*) REAL(C_FLOAT), VALUE :: ALPHA,BETA - REAL(C_FLOAT) :: A(*), B(*), C(*) + REAL(C_FLOAT), DEVICE :: A(*), B(*), C(*) INTEGER(KIND=C_SIZE_T) :: STREAM TYPE(C_PTR), INTENT(IN), VALUE :: ALLOC END SUBROUTINE HIP_SGEMM_GROUPED @@ -226,14 +228,14 @@ SUBROUTINE HIP_DGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: N INTEGER(KIND=JPIM) :: K REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(:) :: AARRAY + REAL(KIND=JPRD), DIMENSION(:), DEVICE :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRD), DIMENSION(:,:) :: BARRAY + 
REAL(KIND=JPRD), DIMENSION(:,:), DEVICE :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(:) :: CARRAY + REAL(KIND=JPRD), DIMENSION(:), DEVICE :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT @@ -275,14 +277,14 @@ SUBROUTINE HIP_SGEMM_BATCHED_OVERLOAD( & INTEGER(KIND=JPIM) :: N INTEGER(KIND=JPIM) :: K REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + REAL(KIND=JPRM), DIMENSION(:), DEVICE :: AARRAY INTEGER(KIND=JPIM) :: LDA INTEGER(KIND=JPIM) :: STRIDEA - REAL(KIND=JPRM), DIMENSION(*) :: BARRAY + REAL(KIND=JPRM), DIMENSION(*), DEVICE :: BARRAY INTEGER(KIND=JPIM) :: LDB INTEGER(KIND=JPIM) :: STRIDEB REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + REAL(KIND=JPRM), DIMENSION(:), DEVICE :: CARRAY INTEGER(KIND=JPIM) :: LDC INTEGER(KIND=JPIM) :: STRIDEC INTEGER(KIND=JPIM) :: BATCHCOUNT @@ -320,16 +322,16 @@ SUBROUTINE HIP_DGEMM_GROUPED_OVERLOAD( & INTEGER(KIND=JPIM) :: N(:) INTEGER(KIND=JPIM) :: K(:) REAL(KIND=JPRD) :: ALPHA - REAL(KIND=JPRD), DIMENSION(:) :: AARRAY + REAL(KIND=JPRD), DIMENSION(:), DEVICE :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: OFFSETA(:) - REAL(KIND=JPRD), DIMENSION(*) :: BARRAY + INTEGER(KIND=JPIB) :: OFFSETA(:) + REAL(KIND=JPRD), DIMENSION(*), DEVICE :: BARRAY INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: OFFSETB(:) + INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRD) :: BETA - REAL(KIND=JPRD), DIMENSION(:) :: CARRAY + REAL(KIND=JPRD), DIMENSION(:), DEVICE :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC @@ -365,16 +367,16 @@ SUBROUTINE HIP_SGEMM_GROUPED_OVERLOAD(& INTEGER(KIND=JPIM) :: N(:) INTEGER(KIND=JPIM) :: K(:) REAL(KIND=JPRM) :: ALPHA - REAL(KIND=JPRM), DIMENSION(:) :: AARRAY + REAL(KIND=JPRM), 
DIMENSION(:), DEVICE :: AARRAY INTEGER(KIND=JPIM) :: LDA - INTEGER(KIND=JPIM) :: OFFSETA(:) - REAL(KIND=JPRM), DIMENSION(:,:,:) :: BARRAY - INTEGER(KIND=JPIM) :: LDB - INTEGER(KIND=JPIM) :: OFFSETB(:) + INTEGER(KIND=JPIB) :: OFFSETA(:) + REAL(KIND=JPRM), DIMENSION(:), DEVICE :: BARRAY + INTEGER(KIND=JPIM), DIMENSION(:) :: LDB + INTEGER(KIND=JPIB) :: OFFSETB(:) REAL(KIND=JPRM) :: BETA - REAL(KIND=JPRM), DIMENSION(:) :: CARRAY + REAL(KIND=JPRM), DIMENSION(:), DEVICE :: CARRAY INTEGER(KIND=JPIM) :: LDC - INTEGER(KIND=JPIM) :: OFFSETC(:) + INTEGER(KIND=JPIB) :: OFFSETC(:) INTEGER(KIND=JPIM) :: BATCHCOUNT INTEGER(KIND=C_INT) :: STREAM TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN) :: ALLOC diff --git a/src/trans/gpu/external/gpnorm_trans_gpu.F90 b/src/trans/gpu/external/gpnorm_trans_gpu.F90 index 8d941f78d..55770298e 100755 --- a/src/trans/gpu/external/gpnorm_trans_gpu.F90 +++ b/src/trans/gpu/external/gpnorm_trans_gpu.F90 @@ -189,7 +189,7 @@ SUBROUTINE GPNORM_TRANS_GPU(PGP,KFIELDS,KPROMA,PAVE,PMIN,PMAX,LDAVE_ONLY,KRESOL) ! done in setup_trans LGPNORM=.TRUE. !!FIXME -!!CALL TRGTOL_CUDAAWARE(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP) +!!CALL TRGTOL(ZGTF,IF_FS,IF_GP,IVSET,PGP=PGP) LGPNORM=.FALSE. ! 
ZGTF is now on GPU diff --git a/src/trans/gpu/external/setup_trans.F90 b/src/trans/gpu/external/setup_trans.F90 index 2045020ce..2b5017ebe 100755 --- a/src/trans/gpu/external/setup_trans.F90 +++ b/src/trans/gpu/external/setup_trans.F90 @@ -138,6 +138,7 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& USE PREPSNM_MOD ,ONLY : PREPSNM #ifdef ACCGPU USE OPENACC +USE CUDAFOR #endif #ifdef OMPGPU USE OMP_LIB @@ -190,6 +191,15 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& #endif INTEGER :: INUMDEVS, IUNIT, ISTAT, IDEV, MYGPU +REAL(KIND=JPRBT), POINTER :: LOCAL_ARR(:,:) +INTEGER(KIND=4) :: IRET +INTEGER(KIND=8) :: IWIDTH, IHEIGHT +INTEGER :: IPITCH_DST, IPITCH_SRC +INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM +TYPE(C_PTR) :: HSTPTR +REAL(KIND=JPRB), POINTER :: RPNMA0(:,:), RPNMS0(:,:) +TYPE(C_DEVPTR) :: DEVPTR + #include "user_clock.intfb.h" ! ------------------------------------------------------------------ @@ -481,33 +491,91 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& ! Initialize A arrays -ALLOCATE(ZAA(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+2)/2,8),D%NUMP)) -ALLOCATE(ZAS(ALIGN(R%NDGNH,8),ALIGN((R%NTMAX+3)/2,8),D%NUMP)) +ALLOCATE(ZAA(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) +ALLOCATE(ZAS(D%OFFSETS_GEMM_MATRIX(D%NUMP+1))) + +!$ACC ENTER DATA CREATE(ZAA,ZAS) WRITE(NOUT,*)'setup_trans: sizes1 NUMP=',D%NUMP -WRITE(NOUT,*)'ZAS:',size(ZAS) -WRITE(NOUT,*)'ZAA:',size(ZAA) +WRITE(NOUT,'("ZAA: ", I, " B")') SIZE(ZAA,KIND=8)*SIZEOF(ZAA(1)) +WRITE(NOUT,'("ZAS: ", I, " B")') SIZE(ZAS,KIND=8)*SIZEOF(ZAS(1)) + +STREAM = ACC_GET_CUDA_STREAM(1) + +! Be very careful here not to touch ZAA, ZAS, ZAA0 and ZAS0 on the host! 
-ZAA(:,:,:) = 0._JPRBT -ZAS(:,:,:) = 0._JPRBT +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC KERNELS DEFAULT(PRESENT) ASYNC(1) +#endif +ZAA(:) = 0._JPRBT +ZAS(:) = 0._JPRBT +#ifdef OMPGPU +#endif +#ifdef ACCGPU +!$ACC END KERNELS +#endif +KMLOC0 = 0 DO JMLOC=1,D%NUMP KM = D%MYMS(JMLOC) KDGLU = G%NDGLU(KM) ILA = (R%NSMAX-KM+2)/2 ILS = (R%NSMAX-KM+3)/2 - ZAA(1:KDGLU,1:ILA,JMLOC)=S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) - ZAS(1:KDGLU,1:ILS,JMLOC)=S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + IF (KM /= 0) THEN + CALL C_F_POINTER(C_LOC(ZAA(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILA/)) + !$ACC HOST_DATA USE_DEVICE(LOCAL_ARR) + DEVPTR = C_DEVLOC(LOCAL_ARR) + !$ACC END HOST_DATA + IPITCH_DST = D%LEGENDRE_MATRIX_STRIDES(JMLOC)*SIZEOF(ZAA(1)) + HSTPTR = C_LOC(S%FA(JMLOC)%RPNMA) + IPITCH_SRC = SIZE(S%FA(JMLOC)%RPNMA,1)*SIZEOF(ZAA(1)) + IWIDTH = KDGLU*SIZEOF(ZAA(1)) + IHEIGHT = ILA + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + + CALL C_F_POINTER(C_LOC(ZAS(1+D%OFFSETS_GEMM_MATRIX(JMLOC))), LOCAL_ARR, & + & (/D%LEGENDRE_MATRIX_STRIDES(JMLOC),ILS/)) + !$ACC HOST_DATA USE_DEVICE(LOCAL_ARR) + DEVPTR = C_DEVLOC(LOCAL_ARR) + !$ACC END HOST_DATA + IPITCH_DST = D%LEGENDRE_MATRIX_STRIDES(JMLOC)*SIZEOF(ZAS(1)) + HSTPTR = C_LOC(S%FA(JMLOC)%RPNMS) + IPITCH_SRC = SIZE(S%FA(JMLOC)%RPNMS,1)*SIZEOF(ZAS(1)) + IWIDTH = KDGLU*SIZEOF(ZAS(1)) + IHEIGHT = ILS + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSE + KMLOC0 = JMLOC + ALLOCATE(ZAA0(ALIGN(KDGLU,8),ILA)) + ALLOCATE(ZAS0(ALIGN(KDGLU,8),ILS)) + RPNMA0 => S%FA(JMLOC)%RPNMA(1:KDGLU,1:ILA) + RPNMS0 => S%FA(JMLOC)%RPNMS(1:KDGLU,1:ILS) + !$ACC ENTER DATA CREATE(ZAA0,ZAS0) + !$ACC KERNELS DEFAULT(PRESENT) ASYNC(1) + ZAA0(:,:) = 0 + ZAS0(:,:) = 0 + !$ACC END KERNELS + !$ACC 
KERNELS DEFAULT(PRESENT) COPYIN(RPNMA0,RPNMS0) ASYNC(1) + ZAA0(1:KDGLU,1:ILA)=RPNMA0(1:KDGLU,1:ILA) + ZAS0(1:KDGLU,1:ILS)=RPNMS0(1:KDGLU,1:ILS) + !$ACC END KERNELS + write(nout,'("ZAA0: ", I, " B")') size(ZAA0)*sizeof(ZAA0(1,1)) + write(nout,'("ZAS0: ", I, " B")') size(ZAS0)*sizeof(ZAS0(1,1)) + ENDIF ENDDO - -! permanent copy of Legendre polynomials into device - -#ifdef ACCGPU -!$ACC ENTER DATA COPYIN(ZAA,ZAS) -#endif -#ifdef OMPGPU -#endif +!$ACC WAIT(1) ALLOCATE(ZEPSNM(D%NUMP,0:R%NTMAX+2)) WRITE(NOUT,*)'ZEPSNM :',SIZE(ZEPSNM) @@ -636,33 +704,9 @@ SUBROUTINE SETUP_TRANS(KSMAX,KDGL,KDLON,KLOEN,LDSPLIT,PSTRET,& D_NUMP=D%NUMP D_NDGL_FS=D%NDGL_FS -KMLOC0 = -1 DO I=1,SIZE(D%MYMS) D_MYMS(I)=D%MYMS(I) - IF(D_MYMS(I) == 0) KMLOC0 = I end DO - -! arrays for m=0 in ledir_mod: -IF(KMLOC0 >= 0) THEN - ALLOCATE(ZAA0(SIZE(ZAA,1),SIZE(ZAA,2))) - ALLOCATE(ZAS0(SIZE(ZAS,1),SIZE(ZAS,2))) - ZAA0 = ZAA(:,:,KMLOC0) - ZAS0 = ZAS(:,:,KMLOC0) -#ifdef ACCGPU - !$ACC ENTER DATA COPYIN(ZAA0,ZAS0) -#endif -#ifdef OMPGPU - !$OMP TARGET ENTER DATA MAP(TO:ZAA0,ZAS0) -#endif - WRITE(NOUT,*) 'GPU arrays for m=0 successfully allocated' -#ifdef ACCGPU - WRITE(NOUT,*) 'Using OpenACC' -#endif -#ifdef OMPGPU - WRITE(NOUT,*) 'Using OpenMP offloading' -#endif -ENDIF - DO I=1,SIZE(F%RW) F_RW(I)=F%RW(I) END DO diff --git a/src/trans/gpu/internal/buffered_allocator_mod.F90 b/src/trans/gpu/internal/buffered_allocator_mod.F90 index 86149ea5f..d883cee8e 100644 --- a/src/trans/gpu/internal/buffered_allocator_mod.F90 +++ b/src/trans/gpu/internal/buffered_allocator_mod.F90 @@ -68,10 +68,11 @@ FUNCTION MAKE_BUFFERED_ALLOCATOR() MAKE_BUFFERED_ALLOCATOR%NEXT_BUF = 0 END FUNCTION MAKE_BUFFERED_ALLOCATOR - FUNCTION RESERVE(ALLOCATOR, SZ) + FUNCTION RESERVE(ALLOCATOR, SZ, WHO) IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(KIND=C_SIZE_T), INTENT(IN) :: SZ + CHARACTER(*), INTENT(IN), OPTIONAL :: WHO TYPE(ALLOCATION_RESERVATION_HANDLE) :: RESERVE @@ -86,7 +87,7 @@ SUBROUTINE 
INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) IMPLICIT NONE TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR !!TYPE(GROWING_ALLOCATION_TYPE), INTENT(IN), POINTER :: GROWING_ALLOCATION - TYPE(GROWING_ALLOCATION_TYPE), target, INTENT(INout) :: GROWING_ALLOCATION + TYPE(GROWING_ALLOCATION_TYPE), TARGET, INTENT(INOUT) :: GROWING_ALLOCATION INTEGER :: I DO I = 0, NBUF-1 @@ -178,7 +179,7 @@ SUBROUTINE ASSIGN_PTR_DOUBLE(DST, SRC, START_IN_BYTES, LENGTH_IN_BYTES, SET_VALU ! setting all bits to 1 (meaning NaN in floating point) !$ACC KERNELS PRESENT(SRC) ASYNC(SET_STREAM_EFF) SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1) = -1 - !$ACC END KERNELS!! LOOP + !$ACC END KERNELS ENDIF CALL C_F_POINTER(C_LOC(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1)), DST, & & [SIZEOF(SRC(START_IN_BYTES:START_IN_BYTES+LENGTH_IN_BYTES-1))/SIZEOF(DST(0))]) diff --git a/src/trans/gpu/internal/cuda_copy_module.F90 b/src/trans/gpu/internal/cuda_copy_module.F90 new file mode 100644 index 000000000..c563bcdca --- /dev/null +++ b/src/trans/gpu/internal/cuda_copy_module.F90 @@ -0,0 +1,1226 @@ + + + +MODULE CUDA_COPY_MODULE + +USE OPENACC +USE CUDAFOR +USE ISO_C_BINDING + + + +IMPLICIT NONE + +PRIVATE + +PUBLIC :: COPY, NH2D, ND2H, ACC_HANDLE_KIND + +INTEGER(4), PARAMETER :: NH2D = CUDAMEMCPYHOSTTODEVICE , ND2H = CUDAMEMCPYDEVICETOHOST + +INTERFACE COPY +MODULE PROCEDURE COPY_2_REAL_4 +MODULE PROCEDURE COPY_2_REAL_8 +MODULE PROCEDURE COPY_3_REAL_4 +MODULE PROCEDURE COPY_3_REAL_8 +MODULE PROCEDURE COPY_4_REAL_4 +MODULE PROCEDURE COPY_4_REAL_8 +END INTERFACE + +CONTAINS + + + SUBROUTINE COPY_2_REAL_4(DST, SRC, KDIR, QUEUE) + REAL(4), TARGET :: DST(:,:), SRC(:,:) + REAL(4), POINTER :: DST_PTR(:,:), SRC_PTR(:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(SRC_PTR, 0) + 
LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 2 .AND. LAST_CONTIG_DIM_DST == 2) THEN + CALL COPY_2_REAL_4_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_2_REAL_4_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_2_REAL_4_1D(DST, SRC, KDIR, QUEUE) + REAL(4), CONTIGUOUS, POINTER :: DST (:,:), SRC (:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_2_REAL_4_2D(DST, SRC, KDIR, QUEUE) + REAL(4), POINTER :: DST (:,:), SRC (:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(3), ISHP_DST(3) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(3), ISTRIDES_DST(3) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = 
GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(SRC, LAST_CONTIG_DIM_SRC+1) + + ! We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 2) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 2) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 2) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 2) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1)) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1)) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2))) + + ISTRIDES_SRC(3) = ISTRIDES_SRC(2) * SIZE(SRC, 2) + ISTRIDES_DST(3) = ISTRIDES_DST(2) * SIZE(DST, 2) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2))) + !$ACC HOST_DATA USE_DEVICE(DST) + DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= 
CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_2_REAL_8(DST, SRC, KDIR, QUEUE) + REAL(8), TARGET :: DST(:,:), SRC(:,:) + REAL(8), POINTER :: DST_PTR(:,:), SRC_PTR(:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 2 .AND. 
LAST_CONTIG_DIM_DST == 2) THEN + CALL COPY_2_REAL_8_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_2_REAL_8_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_2_REAL_8_1D(DST, SRC, KDIR, QUEUE) + REAL(8), CONTIGUOUS, POINTER :: DST (:,:), SRC (:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_2_REAL_8_2D(DST, SRC, KDIR, QUEUE) + REAL(8), POINTER :: DST (:,:), SRC (:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(3), ISHP_DST(3) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(3), ISTRIDES_DST(3) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(SRC, LAST_CONTIG_DIM_SRC+1) + + ! 
We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 2) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 2) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 2) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 2) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1)) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1)) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2))) + + ISTRIDES_SRC(3) = ISTRIDES_SRC(2) * SIZE(SRC, 2) + ISTRIDES_DST(3) = ISTRIDES_DST(2) * SIZE(DST, 2) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2))) + !$ACC HOST_DATA USE_DEVICE(DST) + DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC 
HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_3_REAL_4(DST, SRC, KDIR, QUEUE) + REAL(4), TARGET :: DST(:,:,:), SRC(:,:,:) + REAL(4), POINTER :: DST_PTR(:,:,:), SRC_PTR(:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 3 .AND. 
LAST_CONTIG_DIM_DST == 3) THEN + CALL COPY_3_REAL_4_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_3_REAL_4_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_3_REAL_4_1D(DST, SRC, KDIR, QUEUE) + REAL(4), CONTIGUOUS, POINTER :: DST (:,:,:), SRC (:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_3_REAL_4_2D(DST, SRC, KDIR, QUEUE) + REAL(4), POINTER :: DST (:,:,:), SRC (:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(4), ISHP_DST(4) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(4), ISTRIDES_DST(4) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(SRC, LAST_CONTIG_DIM_SRC+1) + + ! 
We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 3) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 3) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 3) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 3) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2), LBOUND(SRC, 3))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2), LBOUND(DST, 3))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1, LBOUND(SRC, 3))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1, LBOUND(DST, 3))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(3) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3)+1)) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(3) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3)+1)) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(4) = ISTRIDES_SRC(3) * SIZE(SRC, 3) + ISTRIDES_DST(4) = ISTRIDES_DST(3) * SIZE(DST, 3) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3))) + !$ACC HOST_DATA USE_DEVICE(DST) + 
DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_3_REAL_8(DST, SRC, KDIR, QUEUE) + REAL(8), TARGET :: DST(:,:,:), SRC(:,:,:) + REAL(8), POINTER :: DST_PTR(:,:,:), SRC_PTR(:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 3 .AND. 
LAST_CONTIG_DIM_DST == 3) THEN + CALL COPY_3_REAL_8_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_3_REAL_8_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_3_REAL_8_1D(DST, SRC, KDIR, QUEUE) + REAL(8), CONTIGUOUS, POINTER :: DST (:,:,:), SRC (:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_3_REAL_8_2D(DST, SRC, KDIR, QUEUE) + REAL(8), POINTER :: DST (:,:,:), SRC (:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(4), ISHP_DST(4) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(4), ISTRIDES_DST(4) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(SRC, LAST_CONTIG_DIM_SRC+1) + + ! 
We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 3) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 3) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 3) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 3) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2), LBOUND(SRC, 3))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2), LBOUND(DST, 3))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1, LBOUND(SRC, 3))) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1, LBOUND(DST, 3))) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(3) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3)+1)) - LOC (SRC (LBOUND(SRC, 1), LBOUND(SRC, 2),& + & LBOUND(SRC, 3))) + ISTRIDES_DST(3) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3)+1)) - LOC (DST (LBOUND(DST, 1), LBOUND(DST, 2),& + & LBOUND(DST, 3))) + + ISTRIDES_SRC(4) = ISTRIDES_SRC(3) * SIZE(SRC, 3) + ISTRIDES_DST(4) = ISTRIDES_DST(3) * SIZE(DST, 3) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3))) + !$ACC HOST_DATA USE_DEVICE(DST) + 
DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_4_REAL_4(DST, SRC, KDIR, QUEUE) + REAL(4), TARGET :: DST(:,:,:,:), SRC(:,:,:,:) + REAL(4), POINTER :: DST_PTR(:,:,:,:), SRC_PTR(:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 4 .AND. 
LAST_CONTIG_DIM_DST == 4) THEN + CALL COPY_4_REAL_4_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_4_REAL_4_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_4_REAL_4_1D(DST, SRC, KDIR, QUEUE) + REAL(4), CONTIGUOUS, POINTER :: DST (:,:,:,:), SRC (:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:,:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_4_REAL_4_2D(DST, SRC, KDIR, QUEUE) + REAL(4), POINTER :: DST (:,:,:,:), SRC (:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(5), ISHP_DST(5) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(5), ISTRIDES_DST(5) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(SRC, LAST_CONTIG_DIM_SRC+1) + + ! 
We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 4) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 4) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 4) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 4) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1, LBOUND(SRC, 3), LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1, LBOUND(DST, 3), LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(3) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3)+1, LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(3) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3)+1, LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(4) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4)+1)) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(4) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4)+1)) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + 
ISTRIDES_SRC(5) = ISTRIDES_SRC(4) * SIZE(SRC, 4) + ISTRIDES_DST(5) = ISTRIDES_DST(4) * SIZE(DST, 4) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3),LBOUND(SRC, 4))) + !$ACC HOST_DATA USE_DEVICE(DST) + DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3),LBOUND(DST, 4))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3),LBOUND(SRC, 4))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3),LBOUND(DST, 4))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_4_REAL_8(DST, SRC, KDIR, QUEUE) + REAL(8), TARGET :: DST(:,:,:,:), SRC(:,:,:,:) + REAL(8), POINTER :: DST_PTR(:,:,:,:), SRC_PTR(:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + 
LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == 4 .AND. LAST_CONTIG_DIM_DST == 4) THEN + CALL COPY_4_REAL_8_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_4_REAL_8_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_4_REAL_8_1D(DST, SRC, KDIR, QUEUE) + REAL(8), CONTIGUOUS, POINTER :: DST (:,:,:,:), SRC (:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(:,:,:,:), KIND=C_SIZE_T) + + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_4_REAL_8_2D(DST, SRC, KDIR, QUEUE) + REAL(8), POINTER :: DST (:,:,:,:), SRC (:,:,:,:) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(5), ISHP_DST(5) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(5), ISTRIDES_DST(5) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(SRC, 0) + NEXT_CONTIG_DIM_DST = 
GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(DST, LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(SRC, LAST_CONTIG_DIM_SRC+1) + + ! We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= 4) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= 4) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= 4) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= 4) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + + + ISTRIDES_SRC(1) = LOC (SRC(LBOUND(SRC, 1)+1, LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(1) = LOC (DST(LBOUND(DST, 1)+1, LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(2) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2)+1, LBOUND(SRC, 3), LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(2) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2)+1, LBOUND(DST, 3), LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(3) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3)+1, LBOUND(SRC, 4))) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(3) = LOC (DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3)+1, LBOUND(DST, 4))) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(4) = LOC (SRC(LBOUND(SRC, 1), LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4)+1)) - LOC (SRC (LBOUND(SRC, 1),& + & LBOUND(SRC, 2), LBOUND(SRC, 3), LBOUND(SRC, 4))) + ISTRIDES_DST(4) = LOC 
(DST(LBOUND(DST, 1), LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4)+1)) - LOC (DST (LBOUND(DST, 1),& + & LBOUND(DST, 2), LBOUND(DST, 3), LBOUND(DST, 4))) + + ISTRIDES_SRC(5) = ISTRIDES_SRC(4) * SIZE(SRC, 4) + ISTRIDES_DST(5) = ISTRIDES_DST(4) * SIZE(DST, 4) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3),LBOUND(SRC, 4))) + !$ACC HOST_DATA USE_DEVICE(DST) + DEVPTR = C_DEVLOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3),LBOUND(DST, 4))) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(LBOUND(SRC, 1),LBOUND(SRC, 2),LBOUND(SRC, 3),LBOUND(SRC, 4))) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(LBOUND(DST, 1),LBOUND(DST, 2),LBOUND(DST, 3),LBOUND(DST, 4))) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE + + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4(PTR, AFTER) RESULT (JDIM) + REAL(4), POINTER :: PTR (:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, 
ISIZE + INTEGER :: J, LB(2) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2))) - LOC (PTR (LB(1), LB(2))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1)) - LOC (PTR (LB(1), LB(2))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 2 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_4 + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8(PTR, AFTER) RESULT (JDIM) + REAL(8), POINTER :: PTR (:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(2) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2))) - LOC (PTR (LB(1), LB(2))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1)) - LOC (PTR (LB(1), LB(2))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. 
IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 2 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_2_REAL_8 + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4(PTR, AFTER) RESULT (JDIM) + REAL(4), POINTER :: PTR (:,:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(3) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2), LB(3))) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1, LB(3))) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3)+1)) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 3) THEN + ISIZE = SIZE(PTR, 2) + IF (SIZE(PTR, 3) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 2 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 3) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 3 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_4 + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8(PTR, AFTER) RESULT (JDIM) + REAL(8), POINTER :: PTR (:,:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(3) + + ! 
assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2), LB(3))) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1, LB(3))) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3)+1)) - LOC (PTR (LB(1), LB(2), LB(3))) + IF (AFTER < 3) THEN + ISIZE = SIZE(PTR, 2) + IF (SIZE(PTR, 3) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 2 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 3) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 3 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_3_REAL_8 + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4(PTR, AFTER) RESULT (JDIM) + REAL(4), POINTER :: PTR (:,:,:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(4) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2), LB(3), LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. 
IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1, LB(3), LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3)+1, LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 3) THEN + ISIZE = SIZE(PTR, 2) + IF (SIZE(PTR, 3) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 2 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 3) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3), LB(4)+1)) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 4) THEN + ISIZE = SIZE(PTR, 3) + IF (SIZE(PTR, 4) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 3 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 4) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 4 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_4 + + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8(PTR, AFTER) RESULT (JDIM) + REAL(8), POINTER :: PTR (:,:,:,:) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(4) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1)+1, LB(2), LB(3), LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 1) THEN + ISIZE = 1 + IF (SIZE(PTR, 1) /= 1 .AND. 
IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 0 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 1) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2)+1, LB(3), LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 2) THEN + ISIZE = SIZE(PTR, 1) + IF (SIZE(PTR, 2) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 1 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 2) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3)+1, LB(4))) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 3) THEN + ISIZE = SIZE(PTR, 2) + IF (SIZE(PTR, 3) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 2 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 3) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + ITHIS_STRIDE = LOC (PTR (LB(1), LB(2), LB(3), LB(4)+1)) - LOC (PTR (LB(1), LB(2), LB(3), LB(4))) + IF (AFTER < 4) THEN + ISIZE = SIZE(PTR, 3) + IF (SIZE(PTR, 4) /= 1 .AND. IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = 3 + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == 4) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + + JDIM = 4 + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_4_REAL_8 + + +END MODULE CUDA_COPY_MODULE diff --git a/src/trans/gpu/internal/cuda_copy_module.fypp b/src/trans/gpu/internal/cuda_copy_module.fypp new file mode 100644 index 000000000..c1b4d3078 --- /dev/null +++ b/src/trans/gpu/internal/cuda_copy_module.fypp @@ -0,0 +1,249 @@ +#! (C) Copyright 2022- ECMWF. +#! (C) Copyright 2022- Meteo-France. +#! (C) Copyright 2023- NVIDIA +#! +#! This software is licensed under the terms of the Apache Licence Version 2.0 +#! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. +#! In applying this licence, ECMWF does not waive the privileges and immunities +#! 
granted to it by virtue of its status as an intergovernmental organisation +#! nor does it submit to any jurisdiction. + +#! fypp cuda_copy_module.fypp -m collections -m itertools > cuda_copy_module.F90 + +#:set shape = lambda rank : ','.join([':'] * rank) +#:set function_name = lambda dt: str(dt.type).upper() + '_' + str(dt.kind).upper() + +MODULE CUDA_COPY_MODULE + +USE OPENACC +USE CUDAFOR +USE ISO_C_BINDING + +#:set DataType = collections.namedtuple('DataType', ['type', 'kind']) + +#:set fields = list(itertools.product(list(range(2, 5)), [DataType("real", "4"), DataType("real", "8")])) + +IMPLICIT NONE + +PRIVATE + +PUBLIC :: COPY, NH2D, ND2H, ACC_HANDLE_KIND + +INTEGER(4), PARAMETER :: NH2D = CUDAMEMCPYHOSTTODEVICE , ND2H = CUDAMEMCPYDEVICETOHOST + +INTERFACE COPY +#:for rank, datatype in fields +MODULE PROCEDURE COPY_${rank}$_${function_name(datatype)}$ +#:endfor +END INTERFACE + +CONTAINS + +#:for rank, datatype in fields +#:set ftn = f"{rank}_{function_name(datatype)}".upper() +#:set dt = f"{datatype.type}({datatype.kind})".upper() + + SUBROUTINE COPY_${ftn}$(DST, SRC, KDIR, QUEUE) + ${dt}$, TARGET :: DST(${shape(rank)}$), SRC(${shape(rank)}$) + ${dt}$, POINTER :: DST_PTR(${shape(rank)}$), SRC_PTR(${shape(rank)}$) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + + SRC_PTR => SRC + DST_PTR => DST + + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(SRC_PTR, 0) + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(DST_PTR, 0) + + IF (LAST_CONTIG_DIM_SRC == ${rank}$ .AND. 
LAST_CONTIG_DIM_DST == ${rank}$) THEN + CALL COPY_${ftn}$_1D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ELSE + CALL COPY_${ftn}$_2D(DST_PTR, SRC_PTR, KDIR, QUEUE) + ENDIF + END SUBROUTINE + + SUBROUTINE COPY_${ftn}$_1D(DST, SRC, KDIR, QUEUE) + ${dt}$, CONTIGUOUS, POINTER :: DST (${shape(rank)}$), SRC (${shape(rank)}$) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + + INTEGER (KIND=C_SIZE_T) :: ISIZE + + ISIZE = KIND (DST) * SIZE(DST(${','.join([':'] * rank)}$), KIND=C_SIZE_T) + +#:if defined('DEBUG') + PRINT *, "COPY_${ftn}$_1D" + PRINT *, "SIZE", ISIZE + +#:endif + IF (KDIR == NH2D) THEN + !$ACC HOST_DATA USE_DEVICE(DST) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_TO_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_TO_DEVICE(DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + IF(PRESENT(QUEUE))THEN + CALL ACC_MEMCPY_FROM_DEVICE_ASYNC(DST, SRC, ISIZE, QUEUE) + ELSE + CALL ACC_MEMCPY_FROM_DEVICE (DST, SRC, ISIZE) + ENDIF + !$ACC END HOST_DATA + ENDIF + + END SUBROUTINE + SUBROUTINE COPY_${ftn}$_2D(DST, SRC, KDIR, QUEUE) + ${dt}$, POINTER :: DST (${shape(rank)}$), SRC (${shape(rank)}$) + INTEGER (KIND=4), INTENT (IN) :: KDIR + INTEGER (KIND=ACC_HANDLE_KIND), OPTIONAL, INTENT (IN) :: QUEUE + INTEGER :: LAST_CONTIG_DIM_DST, LAST_CONTIG_DIM_SRC + INTEGER :: NEXT_CONTIG_DIM_DST, NEXT_CONTIG_DIM_SRC + + INTEGER :: LAST_CONTIG_DIM + + INTEGER :: IRET + INTEGER :: IPITCH_DST, IPITCH_SRC + INTEGER(KIND=C_SIZE_T) :: IWIDTH, IHEIGHT, ISHP_SRC(${rank+1}$), ISHP_DST(${rank+1}$) + INTEGER(KIND=C_SIZE_T) :: ISTRIDES_SRC(${rank+1}$), ISTRIDES_DST(${rank+1}$) + INTEGER(KIND=CUDA_STREAM_KIND) :: STREAM + TYPE(C_PTR) :: HSTPTR + TYPE(C_DEVPTR) :: DEVPTR + + LAST_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(DST, 0) + LAST_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(SRC, 0) + NEXT_CONTIG_DIM_DST = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(DST, 
LAST_CONTIG_DIM_DST+1) + NEXT_CONTIG_DIM_SRC = GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(SRC, LAST_CONTIG_DIM_SRC+1) + + ! We expect that device is always contiguous, and that host has only one non-contiguous dimension + IF (KDIR == NH2D) THEN + IF (LAST_CONTIG_DIM_DST /= ${rank}$) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_SRC /= ${rank}$) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_SRC + ELSE IF (KDIR == ND2H) THEN + IF (LAST_CONTIG_DIM_SRC /= ${rank}$) CALL ABOR1("device must be contiguous") + IF (NEXT_CONTIG_DIM_DST /= ${rank}$) CALL ABOR1("host must have at most one non-cont dim") + LAST_CONTIG_DIM = LAST_CONTIG_DIM_DST + ENDIF + +#:for d in range(rank) +#:set lb = lambda arr, i: f'LBOUND({arr}, {i+1})' +#:set lbnds = lambda arr, start, end: [lb(arr, i) for i in range(start, end)] +#:set this_slice = lambda arr: ', '.join(lbnds(arr, 0, rank)) +#:set next_slice = lambda arr: ', '.join(lbnds(arr, 0, d) + [lb(arr, d)+'+1'] + lbnds(arr, d+1, rank)) + + ISTRIDES_SRC(${d+1}$) = LOC (SRC(${next_slice('SRC')}$)) - LOC (SRC (${this_slice('SRC')}$)) + ISTRIDES_DST(${d+1}$) = LOC (DST(${next_slice('DST')}$)) - LOC (DST (${this_slice('DST')}$)) +#:endfor + + ISTRIDES_SRC(${rank+1}$) = ISTRIDES_SRC(${rank}$) * SIZE(SRC, ${rank}$) + ISTRIDES_DST(${rank+1}$) = ISTRIDES_DST(${rank}$) * SIZE(DST, ${rank}$) + + ISHP_SRC(1) = 1 + ISHP_SRC(2:) = SHAPE(SRC) + ISHP_DST(1) = 1 + ISHP_DST(2:) = SHAPE(DST) + IWIDTH = PRODUCT(ISHP_DST(1:LAST_CONTIG_DIM+1)) * KIND(DST) + IHEIGHT = PRODUCT(ISHP_DST(LAST_CONTIG_DIM+2:NEXT_CONTIG_DIM_SRC+1)) + + IPITCH_SRC = ISTRIDES_SRC(LAST_CONTIG_DIM+1) + IPITCH_DST = ISTRIDES_DST(LAST_CONTIG_DIM+1) + +#:if defined('DEBUG') + PRINT *, "COPY_${ftn}$_2D" + PRINT *, "IPITCH_SRC/DST", IPITCH_SRC, IPITCH_DST + PRINT *, "IWIDTH/IHEIGHT", IWIDTH, IHEIGHT + +#:endif + #:set ar = lambda arr: ','.join(lbnds(arr, 0, rank)) + IF (KDIR == NH2D) THEN + HSTPTR = C_LOC(SRC(${ar('SRC')}$)) + !$ACC HOST_DATA 
USE_DEVICE(DST) + DEVPTR = C_DEVLOC(DST(${ar('DST')}$)) + !$ACC END HOST_DATA + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(DEVPTR, IPITCH_DST, & + & HSTPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in H2D") + ELSEIF (KDIR == ND2H) THEN + !$ACC HOST_DATA USE_DEVICE(SRC) + DEVPTR = C_DEVLOC(SRC(${ar('SRC')}$)) + !$ACC END HOST_DATA + HSTPTR = C_LOC(DST(${ar('DST')}$)) + IF(PRESENT(QUEUE)) THEN + STREAM = ACC_GET_CUDA_STREAM(QUEUE) + IRET = CUDAMEMCPY2DASYNC(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT, & + & STREAM=STREAM) + ELSE + IRET = CUDAMEMCPY2D(HSTPTR, IPITCH_DST, & + & DEVPTR, IPITCH_SRC, & + & IWIDTH, IHEIGHT) + ENDIF + IF (IRET /= CUDASUCCESS) CALL ABOR1("error in D2H") + ENDIF + END SUBROUTINE +#:endfor + + +#:for rank, datatype in fields +#:set ftn = f"{rank}_{function_name(datatype)}".upper() +#:set dt = f"{datatype.type}({datatype.kind})".upper() + + INTEGER FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$(PTR, AFTER) RESULT (JDIM) + ${dt}$, POINTER :: PTR (${shape(rank)}$) + INTEGER :: AFTER + INTEGER(KIND=C_SIZE_T) :: IPREVIOUS_STRIDE, ITHIS_STRIDE, ISIZE + INTEGER :: J, LB(${rank}$) + + ! assume that dimension all dimensions before AFTER are contiguous... + + LB = LBOUND(PTR) + IF (AFTER == 0) THEN + IPREVIOUS_STRIDE = KIND (PTR) + ENDIF + +#:for d in range (1, rank+1) + #:set ind0 = "" + #:set ind1 = "" + #:for i in range(1,rank+1) + #:set ind0 = ind0 + "LB({}), ".format(i) + #:set ind1 = ind1 + "LB({}){}, ".format(i,"+1"*(i==d)) + #:endfor + #:set ind0 = ind0[:-2] + #:set ind1 = ind1[:-2] + ITHIS_STRIDE = LOC (PTR (${ind1}$)) - LOC (PTR (${ind0}$)) + IF (AFTER < ${d}$) THEN + #:if d == 1 + ISIZE = 1 + #:else + ISIZE = SIZE(PTR, ${d-1}$) + #:endif + IF (SIZE(PTR, ${d}$) /= 1 .AND. 
IPREVIOUS_STRIDE * ISIZE /= ITHIS_STRIDE) THEN + JDIM = ${d-1}$ + RETURN + ENDIF + IPREVIOUS_STRIDE = IPREVIOUS_STRIDE * ISIZE + ELSE IF (AFTER == ${d}$) THEN + IPREVIOUS_STRIDE = ITHIS_STRIDE + ENDIF + +#:endfor + JDIM = ${rank}$ + END FUNCTION GET_LAST_CONTIGUOUS_DIMENSION_${ftn}$ + +#:endfor + +END MODULE CUDA_COPY_MODULE diff --git a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 index 776968471..556507238 100755 --- a/src/trans/gpu/internal/dir_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctl_mod.F90 @@ -155,7 +155,7 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL INSTANTIATE_ALLOCATOR(ALLOCATOR, GROWING_ALLOCATION) ! from the PGP arrays to PREEL_REAL - WRITE(NOUT,*) 'dir_trans_ctl_mod:TRGTOL_CUDAAWARE' + WRITE(NOUT,*) 'dir_trans_ctl_mod:TRGTOL' CALL TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& @@ -170,9 +170,9 @@ SUBROUTINE DIR_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& CALL GSTATS(153,0) - WRITE(NOUT,*) 'dir_trans_ctl_mod:TRLTOM_CUDAAWARE' + WRITE(NOUT,*) 'dir_trans_ctl_mod:TRLTOM' CALL TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) - CALL TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) + CALL TRLTOM(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) CALL TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) CALL GSTATS(153,1) diff --git a/src/trans/gpu/internal/ext_acc.F90 b/src/trans/gpu/internal/ext_acc.F90 deleted file mode 100644 index bf42d9a5f..000000000 --- a/src/trans/gpu/internal/ext_acc.F90 +++ /dev/null @@ -1,357 +0,0 @@ -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! 
In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -module openacc_ext_type - use iso_c_binding - implicit none - private - public :: ext_acc_arr_desc - - ! to my knowledge, this cannot be part of openacc_ext - type ext_acc_arr_desc - integer(c_size_t) :: ptr, sz - end type -end module -module openacc_ext - use iso_c_binding - use iso_fortran_env - use openacc, only : acc_create, acc_copyin, acc_handle_kind - use openacc_ext_type - implicit none - - private - public :: ext_acc_pass, ext_acc_create, ext_acc_copyin, ext_acc_copyout, & - & ext_acc_delete, ext_acc_arr_desc, acc_handle_kind - - type common_pointer_descr - type(c_ptr) :: ptr - integer(c_size_t) :: sz - end type - - interface ext_acc_pass - module procedure ext_acc_pass_2d_r4, ext_acc_pass_3d_r4, ext_acc_pass_4d_r4, ext_acc_pass_2d_r8, ext_acc_pass_3d_r8, ext_acc_pass_4d_r8 - end interface -contains - - function ext_acc_pass_2d_r4(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(4), intent(in) :: arr(:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) - - ! 
for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) - ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) - end function - function ext_acc_pass_3d_r4(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(4), intent(in) :: arr(:,:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) - - ! for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) - end function - function ext_acc_pass_4d_r4(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(4), intent(in) :: arr(:,:,:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) - - ! 
for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) - end function - function ext_acc_pass_2d_r8(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(8), intent(in) :: arr(:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 2) - 1) - - ! for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2))) - ptr2 = c_loc(arr(lbound(arr,1)+1, lbound(arr,2))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 1) - end function - function ext_acc_pass_3d_r8(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(8), intent(in) :: arr(:,:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 3) - 1) - - ! 
for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr,3))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2)+1, lbound(arr,3))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 2) - end function - function ext_acc_pass_4d_r8(arr) result(ret) - implicit none - type(ext_acc_arr_desc) :: ret - real(8), intent(in) :: arr(:,:,:,:) - - type(c_ptr) :: ptr1, ptr2 - integer(c_size_t) :: ptr1_v, ptr2_v - - ! get full slices for all but the last slice - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4)+1)) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - - ret%ptr = ptr1_v - ret%sz = (ptr2_v - ptr1_v) * (size(arr, 4) - 1) - - ! for the last slice, take the actual offset, otherwise we imght go OOB - ptr1 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3), lbound(arr,4))) - ptr2 = c_loc(arr(lbound(arr,1), lbound(arr,2), lbound(arr, 3)+1, lbound(arr,4))) - ptr1_v= transfer(ptr1, ptr1_v) - ptr2_v= transfer(ptr2, ptr2_v) - ret%sz = ret%sz + (ptr2_v - ptr1_v) * size(arr, 3) - end function - function get_common_pointers(in_ptrs, out_ptrs) result(num_ranges) - implicit none - type(ext_acc_arr_desc), intent(in) :: in_ptrs(:) - type(common_pointer_descr), intent(out) :: out_ptrs(:) - - integer(c_size_t), allocatable :: ptrs_only(:) - logical, allocatable :: mask(:) - integer, allocatable :: sort_index(:) - - type(ext_acc_arr_desc), allocatable :: common_ptrs(:) - integer :: i, j, num_ranges - integer(c_size_t) :: start1, start2, end1, end2 - logical :: found - - ! first sort the pointers increasingly such that no gaps are possible - allocate(ptrs_only(size(in_ptrs))) - do i = 1, size(in_ptrs) - ptrs_only(i) = in_ptrs(i)%ptr - enddo - allocate(mask(size(in_ptrs))) - do i = 1, size(in_ptrs) - mask(i) = .true. 
- enddo - allocate(sort_index(size(in_ptrs))) - do i = 1, size(in_ptrs) - j = minloc(ptrs_only, 1, mask=mask) - mask(j) = .false. - sort_index(i) = j - enddo - - ! initialize - allocate(common_ptrs(size(in_ptrs))) - do i = 1, size(in_ptrs) - common_ptrs(1)%ptr = 0 - common_ptrs(1)%sz = 0 - enddo - - num_ranges = 1 - common_ptrs(1) = in_ptrs(sort_index(1)) - do i = 2, size(in_ptrs) - found = .false. - start1 = in_ptrs(sort_index(i))%ptr - end1 = in_ptrs(sort_index(i))%ptr + in_ptrs(sort_index(i))%sz - do j = 1, num_ranges - start2 = common_ptrs(j)%ptr - end2 = common_ptrs(j)%ptr + common_ptrs(j)%sz - if (max(start1, start2) <= min(end1, end2)) then - ! if we intersect with this range, extend the range - common_ptrs(j)%ptr = min(start1, start2) - common_ptrs(j)%sz = max(end1, end2) - common_ptrs(j)%ptr - found = .true. - exit - endif - enddo - if (.not. found) then - ! if we did not find anything: add a new one - num_ranges = num_ranges + 1 - common_ptrs(num_ranges)%ptr = start1 - common_ptrs(num_ranges)%sz = end1 - start1 - endif - enddo - do i = 1, num_ranges - out_ptrs(i)%ptr = transfer(common_ptrs(i)%ptr, out_ptrs(i)%ptr) - out_ptrs(i)%sz = common_ptrs(i)%sz - enddo - end function - subroutine ext_acc_create(ptrs, stream) - use openacc, only : acc_create, acc_async_sync - implicit none - type(ext_acc_arr_desc), intent(in) :: ptrs(:) - integer(acc_handle_kind), optional :: stream - - type(common_pointer_descr), allocatable :: common_ptrs(:) - - integer :: i, num_ranges - integer(kind=int32), pointer :: pp(:) - integer(acc_handle_kind) :: stream_act - - if (present(stream)) then - stream_act = stream - else - stream_act = acc_async_sync - endif - allocate(common_ptrs(size(ptrs))) - num_ranges = get_common_pointers(ptrs, common_ptrs) - - do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) - !!call acc_create_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_create(pp, int(common_ptrs(i)%sz)) - enddo - end 
subroutine - subroutine ext_acc_copyin(ptrs, stream) - use openacc - implicit none - type(ext_acc_arr_desc), intent(in) :: ptrs(:) - integer(acc_handle_kind), optional :: stream - - type(common_pointer_descr), allocatable :: common_ptrs(:) - - integer :: i, num_ranges - integer(4), pointer :: pp(:) - - integer(acc_handle_kind) :: stream_act - - if (present(stream)) then - stream_act = stream - else - stream_act = acc_async_sync - endif - allocate(common_ptrs(size(ptrs))) - num_ranges = get_common_pointers(ptrs, common_ptrs) - - do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) - !!call acc_copyin_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_copyin(pp, int(common_ptrs(i)%sz)) - enddo - end subroutine - subroutine ext_acc_copyout(ptrs, stream) - use openacc - implicit none - type(ext_acc_arr_desc), intent(in) :: ptrs(:) - integer(acc_handle_kind), optional :: stream - - type(common_pointer_descr), allocatable :: common_ptrs(:) - - integer :: i, num_ranges - integer(4), pointer :: pp(:) - - integer(acc_handle_kind) :: stream_act - - if (present(stream)) then - stream_act = stream - else - stream_act = acc_async_sync - endif - allocate(common_ptrs(size(ptrs))) - num_ranges = get_common_pointers(ptrs, common_ptrs) - - do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) - !!call acc_copyout_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_copyout(pp, int(common_ptrs(i)%sz)) - enddo - end subroutine - subroutine ext_acc_delete(ptrs, stream) - use openacc - implicit none - type(ext_acc_arr_desc), intent(in) :: ptrs(:) - integer(acc_handle_kind), optional :: stream - - type(common_pointer_descr), allocatable :: common_ptrs(:) - - integer :: i, num_ranges - integer(4), pointer :: pp(:) - - integer(acc_handle_kind) :: stream_act - - if (present(stream)) then - stream_act = stream - else - stream_act = acc_async_sync - endif - 
allocate(common_ptrs(size(ptrs))) - num_ranges = get_common_pointers(ptrs, common_ptrs) - - do i = 1, num_ranges - call c_f_pointer(common_ptrs(i)%ptr, pp, shape=[common_ptrs(i)%sz/sizeof(pp(1))]) - !!call acc_delete_async(pp, common_ptrs(i)%sz, async=stream_act) - call acc_delete(pp, int(common_ptrs(i)%sz)) - enddo - end subroutine -end module diff --git a/src/trans/gpu/internal/fsc_mod.F90 b/src/trans/gpu/internal/fsc_mod.F90 index 6c0a2d162..85c153498 100755 --- a/src/trans/gpu/internal/fsc_mod.F90 +++ b/src/trans/gpu/internal/fsc_mod.F90 @@ -65,7 +65,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPIB USE TPM_TRANS ,ONLY : LATLON USE TPM_DISTR ,ONLY : D, MYSETW, MYPROC, NPROC, D_NUMP, D_NPTRLS, D_NSTAGTF @@ -85,10 +85,10 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE INTEGER(KIND=JPIM) :: KGL REAL(KIND=JPRBT) :: ZACHTE2 -REAL(KIND=JPRBT) :: ZAMP, ZPHASE -INTEGER(KIND=JPIM) :: IOFF_LAT,OFFSET_VAR -INTEGER(KIND=JPIM) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER -INTEGER(KIND=JPIM) :: JF,IGLG,II,JM +INTEGER(KIND=JPIM) :: OFFSET_VAR +INTEGER(KIND=JPIB) :: IOFF_LAT +INTEGER(KIND=JPIB) :: IOFF_SCALARS,IOFF_SCALARS_EWDER,IOFF_UV,IOFF_UV_EWDER,IOFF_KSCALARS_NSDER +INTEGER(KIND=JPIM) :: JF,IGLG,JM INTEGER(KIND=JPIM) :: IBEG,IEND,IINC REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX @@ -134,7 +134,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = F_RACTHE(IGLG) @@ -163,7 +163,7 @@ SUBROUTINE 
FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE DO JM=0,R_NSMAX !(note that R_NSMAX <= G_NMEN(IGLG) for all IGLG) IGLG = OFFSET_VAR+KGL-1 IF (JM <= G_NMEN(IGLG)) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_KSCALARS_NSDER = IOFF_LAT+(KSCALARS_NSDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) ZACHTE2 = F_RACTHE(IGLG) @@ -201,7 +201,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! to fill those floor(NLON/2)+1 values. ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. IF (JM <= G_NLOEN(IGLG)/2) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_UV = IOFF_LAT+(KUV_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_UV_EWDER = IOFF_LAT+(KUV_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) @@ -242,7 +242,7 @@ SUBROUTINE FSC(ALLOCATOR,HFSC,PREEL_COMPLEX, KF_FS, KF_UV, KF_SCALARS, KUV_OFFSE ! to fill those floor(NLON/2)+1 values. ! Truncation happens starting at G_NMEN+1. Hence, we zero-fill those values. 
IF (JM <= G_NLOEN(IGLG)/2) THEN - IOFF_LAT = KF_FS*D_NSTAGTF(KGL) + IOFF_LAT = 1_JPIB*KF_FS*D_NSTAGTF(KGL) IOFF_SCALARS_EWDER = IOFF_LAT+(KSCALARS_EWDER_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) IOFF_SCALARS = IOFF_LAT+(KSCALARS_OFFSET+JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) diff --git a/src/trans/gpu/internal/ftdir_mod.F90 b/src/trans/gpu/internal/ftdir_mod.F90 index b131947c7..a3fe9878c 100755 --- a/src/trans/gpu/internal/ftdir_mod.F90 +++ b/src/trans/gpu/internal/ftdir_mod.F90 @@ -64,7 +64,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) USE TPM_DISTR ,ONLY : MYSETW, MYPROC, NPROC, D_NSTAGT0B, D_NSTAGTF,D_NPTRLS, D_NPNTGTB0, D_NPROCM, D_NDGL_FS USE TPM_GEOMETRY ,ONLY : G_NMEN, G_NLOEN USE TPM_HICFFT ,ONLY : EXECUTE_DIR_FFT - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE @@ -89,7 +89,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(430,1) ENDIF CALL GSTATS(413,0) @@ -99,7 +99,7 @@ SUBROUTINE FTDIR(ALLOCATOR,HFTDIR,PREEL_REAL,PREEL_COMPLEX,KFIELD) IF (LSYNC_TRANS) THEN CALL GSTATS(433,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(433,1) ENDIF CALL GSTATS(413,1) diff --git a/src/trans/gpu/internal/ftinv_mod.F90 b/src/trans/gpu/internal/ftinv_mod.F90 index 77a8cf223..f0759f168 100755 --- a/src/trans/gpu/internal/ftinv_mod.F90 +++ b/src/trans/gpu/internal/ftinv_mod.F90 @@ -67,7 +67,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) USE TPM_DISTR ,ONLY : MYSETW, MYPROC, NPROC, D_NPTRLS, D_NDGL_FS, D_NSTAGTF USE TPM_GEOMETRY ,ONLY : G_NLOEN USE TPM_HICFFT ,ONLY : EXECUTE_INV_FFT - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT 
NONE @@ -90,7 +90,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(423,0) @@ -100,7 +100,7 @@ SUBROUTINE FTINV(ALLOCATOR,HFTINV,PREEL_COMPLEX,PREEL_REAL,KFIELD) IF (LSYNC_TRANS) THEN CALL GSTATS(443,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(443,1) ENDIF CALL GSTATS(423,1) diff --git a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 index 50e40cd28..0171b4ff5 100644 --- a/src/trans/gpu/internal/inv_trans_ctl_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctl_mod.F90 @@ -215,10 +215,10 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& CALL GSTATS(102,1) ! Packing into send buffer, to fourier space and unpack - WRITE(NOUT,*) 'inv_trans_ctl_mod:TRMTOL_CUDAAWARE' + WRITE(NOUT,*) 'inv_trans_ctl_mod:TRMTOL' CALL GSTATS(152,0) CALL TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) - CALL TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL TRMTOL(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) CALL TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) CALL GSTATS(152,1) @@ -233,8 +233,8 @@ SUBROUTINE INV_TRANS_CTL(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! 
Transposition into grid-point space CALL GSTATS(157,0) - WRITE(NOUT,*) 'inv_trans_ctl_mod:TRLTOG_CUDAAWARE' - CALL TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& + WRITE(NOUT,*) 'inv_trans_ctl_mod:TRLTOG' + CALL TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,IF_FOURIER,KF_GP,KF_UV_G,KF_SCALARS_G,& & KVSETUV=KVSETUV,KVSETSC=KVSETSC,& & KVSETSC3A=KVSETSC3A,KVSETSC3B=KVSETSC3B,KVSETSC2=KVSETSC2,& & PGP=PGP,PGPUV=PGPUV,PGP3A=PGP3A,PGP3B=PGP3B,PGP2=PGP2) diff --git a/src/trans/gpu/internal/ledir_mod.F90 b/src/trans/gpu/internal/ledir_mod.F90 index 1effbae63..b7bde52ca 100755 --- a/src/trans/gpu/internal/ledir_mod.F90 +++ b/src/trans/gpu/internal/ledir_mod.F90 @@ -23,7 +23,7 @@ MODULE LEDIR_MOD CONTAINS SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB USE TPM_DIM ,ONLY : R USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 @@ -31,8 +31,10 @@ SUBROUTINE LEDIR_STRIDES(KF_FS,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IIN_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE @@ -97,17 +99,16 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) ! 
------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R_NDGNH,R_NSMAX,R_NTMAX,R_NDGL USE TPM_GEOMETRY ,ONLY : G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 USE HICBLAS_MOD ,ONLY : HIP_GEMM_BATCHED, HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE, INTRINSIC :: ISO_C_BINDING USE IEEE_ARITHMETIC USE OPENACC @@ -132,15 +133,18 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) INTEGER(KIND=JPIM) :: KM INTEGER(KIND=JPIM) :: KMLOC INTEGER(KIND=JPIM) :: IA, IS, ISL, J - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) + INTEGER(KIND=JPIB) :: AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPRBT) :: PAIA, PAIS, V1, V2 INTEGER(KIND=JPIM) :: IGLS, JF, JGL INTEGER(KIND=JPIM) :: OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_STRIDES1 - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_STRIDES1 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_STRIDES1 + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_STRIDES1 INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_STRIDES1 INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_STRIDES1 INTEGER(KIND=8) :: ALLOC_SZ, ALLOC_POS @@ -169,7 +173,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC WAIT(1) #endif CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') + CALL 
MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(430,1) ENDIF CALL GSTATS(414,0) @@ -206,7 +210,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+2)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -225,7 +229,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPA, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -240,7 +244,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC WAIT(1) #endif CALL GSTATS(434,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(434,1) ENDIF CALL GSTATS(414,1) @@ -283,7 +287,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC WAIT(1) #endif CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(430,1) ENDIF CALL GSTATS(414,0) @@ -321,7 +325,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) NS(KMLOC) = (R_NSMAX-KM+3)/2 KS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM1(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM2(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -340,7 +344,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) & 2*KF_FS, NS(:), KS(:), & & 1.0_JPRBT, & & ZINPS, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, 
& & ZOUT, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -355,7 +359,7 @@ SUBROUTINE LEDIR(ALLOCATOR,ZINPS,ZINPA,ZINPS0,ZINPA0,ZOUT,ZOUT0,POA1,KF_FS) !$ACC WAIT(1) #endif CALL GSTATS(434,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(434,1) ENDIF CALL GSTATS(414,1) diff --git a/src/trans/gpu/internal/leinv_mod.F90 b/src/trans/gpu/internal/leinv_mod.F90 index 3d9c56425..0fb990fe7 100755 --- a/src/trans/gpu/internal/leinv_mod.F90 +++ b/src/trans/gpu/internal/leinv_mod.F90 @@ -23,7 +23,7 @@ MODULE LEINV_MOD CONTAINS SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& IOUT0_STRIDES0,IOUT0_SIZE,IIN0_STRIDES0,IIN0_SIZE) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM, JPIB USE TPM_DIM ,ONLY : R USE TPM_DISTR, ONLY: D,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 @@ -31,8 +31,10 @@ SUBROUTINE LEINV_STRIDES(KF_LEG,IOUT_STRIDES0,IOUT_SIZE,IIN_STRIDES0,IIN_SIZE,& INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG - INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IOUT_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IOUT_SIZE + INTEGER(KIND=JPIM), OPTIONAL :: IIN_STRIDES0 + INTEGER(KIND=JPIB), OPTIONAL :: IIN_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM), OPTIONAL :: IIN0_STRIDES0, IIN0_SIZE @@ -95,12 +97,12 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) ! 
------------------------------------------------------------------ USE TPM_GEN ,ONLY : LSYNC_TRANS - USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB, JPRBT, JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB, JPRBT, JPRD, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R_NDGNH,R_NSMAX, R_NDGL USE TPM_GEOMETRY ,ONLY : G_NDGLU USE TPM_FIELDS ,ONLY : ZAA,ZAS,ZAA0,ZAS0,KMLOC0 - USE TPM_DISTR ,ONLY : D_NUMP,D_MYMS,MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 + USE TPM_DISTR ,ONLY : D,D_NUMP,D_MYMS,MYPROC,D_OFFSETS_GEMM1,D_OFFSETS_GEMM2 USE HICBLAS_MOD ,ONLY : HIP_GEMM_BATCHED, HIP_DGEMM_BATCHED_OVERLOAD, & & HIP_DGEMM_GROUPED_OVERLOAD, HIP_SGEMM_GROUPED_OVERLOAD #ifdef TRANS_SINGLE @@ -110,7 +112,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) #endif USE, INTRINSIC :: ISO_C_BINDING - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX IMPLICIT NONE @@ -122,10 +124,13 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR ! LOCAL - INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP), AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) + INTEGER(KIND=JPIM) :: KS(D_NUMP), NS(D_NUMP) + INTEGER(KIND=JPIB) :: AOFFSETS(D_NUMP), BOFFSETS(D_NUMP), COFFSETS(D_NUMP) INTEGER(KIND=JPIM) :: KM, KMLOC, IA, IS, ISL, J1, JGL, JK, J - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE @@ -188,6 +193,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) DO J=1,(R_NSMAX-KM+2)/2 ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO + ! 
those are - in principle - only needed with tensor cores (zinp might contain NaNs!) + !$ACC LOOP SEQ + DO J=(R_NSMAX-KM+2)/2+1,ALIGN((R_NSMAX-KM+2)/2,A) + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=0 + ENDDO ELSEIF (MOD((JK-1),2) .EQ. 0) THEN ! every other field is sufficient because Im(KM=0) == 0 #ifdef OMPGPU @@ -198,6 +208,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) DO J=1,(R_NSMAX+2)/2 ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IA+1+(J-1)*2,KMLOC) ENDDO + ! those are - in principle - only needed with tensor cores (zinp might contain NaNs!) + !$ACC LOOP SEQ + DO J=(R_NSMAX+2)/2+1,ALIGN((R_NSMAX+2)/2,A) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = 0 + ENDDO ENDIF ENDDO ENDDO @@ -208,7 +223,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC WAIT(1) #endif CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) @@ -245,7 +260,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+2)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAA,1)*SIZE(ZAA,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -264,7 +279,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAA, SIZE(ZAA,1), BOFFSETS, & + & ZAA, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTA, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -280,7 +295,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC WAIT(1) #endif CALL GSTATS(444,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(444,1) ENDIF CALL 
GSTATS(424,1) @@ -313,6 +328,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) DO J=1,(R_NSMAX-KM+3)/2 ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO + ! those are - in principle - only needed with tensor cores (zinp might contain NaNs!) + !$ACC LOOP SEQ + DO J=(R_NSMAX-KM+3)/2+1,ALIGN((R_NSMAX-KM+3)/2,A) + ZINP(JK+(J-1)*IIN_STRIDES0+D_OFFSETS_GEMM2(KMLOC)*IIN_STRIDES0)=0 + ENDDO ELSEIF (MOD((JK-1),2) == 0) THEN #ifdef OMPGPU #endif @@ -322,6 +342,11 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) DO J=1,(R_NSMAX+3)/2 ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = PIA(JK,IS+1+(J-1)*2,KMLOC) ENDDO + ! those are - in principle - only needed with tensor cores (zinp might contain NaNs!) + !$ACC LOOP SEQ + DO J=(R_NSMAX+3)/2+1,ALIGN((R_NSMAX+3)/2,A) + ZINP0((JK-1)/2+1+(J-1)*IIN0_STRIDES0) = 0 + ENDDO ENDIF ENDDO ENDDO @@ -331,7 +356,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC WAIT(1) #endif CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(424,0) @@ -365,7 +390,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) KS(KMLOC) = (R_NSMAX-KM+3)/2 NS(KMLOC) = G_NDGLU(KM) AOFFSETS(KMLOC) = IIN_STRIDES0*D_OFFSETS_GEMM2(KMLOC) - BOFFSETS(KMLOC) = SIZE(ZAS,1)*SIZE(ZAS,2)*(KMLOC-1) + BOFFSETS(KMLOC) = D%OFFSETS_GEMM_MATRIX(KMLOC) COFFSETS(KMLOC) = IOUT_STRIDES0*D_OFFSETS_GEMM1(KMLOC) ENDDO IF(KMLOC0 > 0) THEN @@ -384,7 +409,7 @@ SUBROUTINE LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) & 2*KF_LEG, NS(:), KS(:), & & 1.0_JPRBT, & & ZINP, IIN_STRIDES0, AOFFSETS, & - & ZAS, SIZE(ZAS,1), BOFFSETS, & + & ZAS, D%LEGENDRE_MATRIX_STRIDES, BOFFSETS, & & 0.0_JPRBT, & & ZOUTS, IOUT_STRIDES0, COFFSETS, & & D_NUMP, STREAM=1_C_INT, ALLOC=ALLOCATOR%PTR) @@ -399,7 +424,7 @@ SUBROUTINE 
LEINV(ALLOCATOR,PIA,ZINP,ZINP0,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,KF_LEG) !$ACC WAIT(1) #endif CALL GSTATS(444,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(444,1) ENDIF CALL GSTATS(424,1) diff --git a/src/trans/gpu/internal/ltdir_mod.F90 b/src/trans/gpu/internal/ltdir_mod.F90 index 3f3f2eb07..012df030e 100755 --- a/src/trans/gpu/internal/ltdir_mod.F90 +++ b/src/trans/gpu/internal/ltdir_mod.F90 @@ -23,10 +23,9 @@ MODULE LTDIR_MOD CONTAINS FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DISTR, ONLY: D USE TPM_DIM, ONLY: R - USE ISO_C_BINDING USE LEDIR_MOD USE BUFFERED_ALLOCATOR_MOD @@ -36,8 +35,9 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV TYPE(LTDIR_HANDLE) :: HLTDIR - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY @@ -47,15 +47,15 @@ FUNCTION PREPARE_LTDIR(ALLOCATOR, KF_FS, KF_UV) RESULT(HLTDIR) IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) ! POA1 - IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = ALIGN(2_JPIB*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) ! POA2 - IALLOC_SZ = IALLOC_SZ + ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IALLOC_SZ = IALLOC_SZ + ALIGN(4_JPIB*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) ! ZOUT IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT_SIZE*SIZEOF(ZPRBT_DUMMY),128) ! 
ZOUT0 IALLOC_SZ = IALLOC_SZ+ ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) - HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTDIR%HOUT_AND_POA = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTDIR%HOUT_AND_POA") END FUNCTION PREPARE_LTDIR SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALARS,& @@ -63,7 +63,7 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA & PSPSC3A,PSPSC3B,PSPSC2, & & KFLDPTRUV,KFLDPTRSC) - USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB + USE PARKIND_ECTRANS ,ONLY : JPIM, JPRBT, JPRD, JPRB, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R @@ -75,12 +75,12 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA USE UVTVD_MOD USE UPDSP_MOD ,ONLY : UPDSP USE UPDSPB_MOD ,ONLY : UPDSPB - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_TRANS ,ONLY : NF_SC2, NF_SC3A, NF_SC3B USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX USE BUFFERED_ALLOCATOR_MOD - USE ISO_C_BINDING, ONLY: C_SIZE_T, C_F_POINTER, C_LOC + USE ISO_C_BINDING, ONLY: C_F_POINTER, C_LOC !**** *LTDIR* - Control of Direct Legendre transform step @@ -166,8 +166,9 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA REAL(KIND=JPRD), POINTER :: ZOUT0(:) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTDIR_HANDLE), INTENT(IN) :: HLTDIR - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE @@ -190,13 +191,13 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IALLOC_POS = 1 - IALLOC_SZ = ALIGN(2*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) + IALLOC_SZ = ALIGN(2_JPIB*KF_FS*(R%NTMAX+3)*D%NUMP*SIZEOF(POA1_L(1)),128) CALL ASSIGN_PTR(POA1_L, 
GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA1_L), POA1, (/ 2*KF_FS, R%NTMAX+3, D%NUMP /)) IALLOC_POS = IALLOC_POS + IALLOC_SZ - IALLOC_SZ = ALIGN(4*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) + IALLOC_SZ = ALIGN(4_JPIB*KF_UV*(R%NTMAX+3)*D%NUMP*SIZEOF(POA2_L(1)),128) CALL ASSIGN_PTR(POA2_L, GET_ALLOCATION(ALLOCATOR, HLTDIR%HOUT_AND_POA),& & IALLOC_POS, IALLOC_SZ, SET_STREAM=1) CALL C_F_POINTER(C_LOC(POA2_L), POA2, (/ 4*KF_UV, R%NTMAX+3, D%NUMP /)) @@ -267,7 +268,7 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(430,1) ENDIF CALL GSTATS(412,0) @@ -282,7 +283,7 @@ SUBROUTINE LTDIR(ALLOCATOR,HLTDIR,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV,KF_SCALA #endif IF (LSYNC_TRANS) THEN CALL GSTATS(432,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(432,1) ENDIF CALL GSTATS(412,1) diff --git a/src/trans/gpu/internal/ltinv_mod.F90 b/src/trans/gpu/internal/ltinv_mod.F90 index 861aea97b..e438d505a 100755 --- a/src/trans/gpu/internal/ltinv_mod.F90 +++ b/src/trans/gpu/internal/ltinv_mod.F90 @@ -25,10 +25,9 @@ MODULE LTINV_MOD CONTAINS FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT(HLTINV) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DISTR, ONLY: D USE TPM_DIM, ONLY: R - USE ISO_C_BINDING USE LEINV_MOD USE BUFFERED_ALLOCATOR_MOD @@ -40,10 +39,12 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( TYPE(LTINV_HANDLE) :: HLTINV - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ, IPIA_SZ - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIB) :: IALLOC_SZ, IPIA_SZ + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, 
IOUT0_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY @@ -63,7 +64,7 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( IF (LSCDERS) & IF_READIN = IF_READIN + KF_SCALARS ! Scalars NS Derivatives - IPIA_SZ = ALIGN(2*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) + IPIA_SZ = ALIGN(2_JPIB*IF_READIN*(R%NSMAX+3)*D%NUMP*SIZEOF(ZPRBT_DUMMY),128) ! In Legendre space, we then ignore vorticity/divergence, if ! they don't need to be transformed. @@ -81,7 +82,7 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( ! ZINP0 IALLOC_SZ = IALLOC_SZ + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) - HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTINV%HPIA_AND_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTINV_HPIA_AND_IN") IALLOC_SZ = 0 ! ZOUTA @@ -93,7 +94,7 @@ FUNCTION PREPARE_LTINV(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) RESULT( ! 
ZOUTS0 IALLOC_SZ = IALLOC_SZ + ALIGN(IOUT0_SIZE*SIZEOF(ZPRD_DUMMY),128) - HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ) + HLTINV%HOUTS_AND_OUTA = RESERVE(ALLOCATOR, IALLOC_SZ, "HLTINV_HOUTS_AND_OUTA") END FUNCTION PREPARE_LTINV @@ -101,7 +102,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& & PSPVOR,PSPDIV,PSPSCALAR,PSPSC3A,PSPSC3B,PSPSC2, & & ZOUTS,ZOUTA,ZOUTS0,ZOUTA0) - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB, JPRBT, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK USE TPM_DIM ,ONLY : R @@ -116,7 +117,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& USE ABORT_TRANS_MOD ,ONLY : ABORT_TRANS use ieee_arithmetic USE TPM_FIELDS ,ONLY : F,ZEPSNM - USE MPL_MODULE ,ONLY : MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_BARRIER, MPL_ALL_MS_COMM USE TPM_GEN ,ONLY : LSYNC_TRANS USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX @@ -194,13 +195,15 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(LTINV_HANDLE), INTENT(IN) :: HLTINV - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE INTEGER(KIND=JPIM) :: IF_READIN, IF_LEG - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ REAL(KIND=JPRBT), POINTER :: ZINP(:) REAL(KIND=JPRD), POINTER :: ZINP0(:) @@ -315,7 +318,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(422,0) @@ -330,7 +333,7 @@ SUBROUTINE LTINV(ALLOCATOR,HLTINV,KF_UV,KF_SCALARS,& #endif IF (LSYNC_TRANS) THEN CALL GSTATS(442,0) - CALL MPL_BARRIER(CDSTRING='') + CALL 
MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(442,1) ENDIF CALL GSTATS(422,1) diff --git a/src/trans/gpu/internal/sump_trans_mod.F90 b/src/trans/gpu/internal/sump_trans_mod.F90 index 64c907f4f..d20d47700 100755 --- a/src/trans/gpu/internal/sump_trans_mod.F90 +++ b/src/trans/gpu/internal/sump_trans_mod.F90 @@ -19,7 +19,7 @@ SUBROUTINE SUMP_TRANS ! Modifications : ! P.Marguinaud : 11-Sep-2012 : Fix twice allocated pointer -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPRD +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPRD, JPIB USE TPM_GEN ,ONLY : NOUT, NPRINTLEV USE TPM_DIM ,ONLY : R @@ -41,7 +41,8 @@ SUBROUTINE SUMP_TRANS INTEGER(KIND=JPIM) :: JM INTEGER(KIND=JPIM) :: JGL,IGL,IPLAT,ISENDSET,IRECVSET,JML,IPOS,IM -INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,OFFSET1,OFFSET2,KMLOC,KM +INTEGER(KIND=JPIM) :: IGPTOT,IMEDIAP,IRESTM,JA,JB,IOFF,KMLOC,KM +INTEGER(KIND=JPIB) :: OFFSET1,OFFSET2,OFFSET3 INTEGER(KIND=JPIM),ALLOCATABLE :: IGPTOTL(:,:) REAL(KIND=JPRBT),ALLOCATABLE :: ZDUM(:) @@ -273,23 +274,33 @@ SUBROUTINE SUMP_TRANS ALLOCATE(D%OFFSETS_GEMM1(D%NUMP+1)) ALLOCATE(D%OFFSETS_GEMM2(D%NUMP+1)) +ALLOCATE(D%OFFSETS_GEMM_MATRIX(D%NUMP+1)) +ALLOCATE(D%LEGENDRE_MATRIX_STRIDES(D%NUMP)) OFFSET1 = 0 OFFSET2 = 0 +OFFSET3 = 0 DO KMLOC=1,D%NUMP KM = D%MYMS(KMLOC) D%OFFSETS_GEMM1(KMLOC) = OFFSET1 D%OFFSETS_GEMM2(KMLOC) = OFFSET2 + D%OFFSETS_GEMM_MATRIX(KMLOC) = OFFSET3 !KM=0 is transformed in double precision, no need to store here IF (KM /= 0) THEN OFFSET1 = OFFSET1 + ALIGN(G%NDGLU(KM),8) ! N_OFFSET takes the max of the two GEMMs OFFSET2 = OFFSET2 + ALIGN((R%NSMAX-KM+3)/2,8) + + D%LEGENDRE_MATRIX_STRIDES(KMLOC) = ALIGN(G%NDGLU(KM),8) + ! Note that both sizes have to be aligned because we make the GEMMs + ! multiples of 8 + OFFSET3 = OFFSET3 + ALIGN((R%NSMAX-KM+3)/2,8) * D%LEGENDRE_MATRIX_STRIDES(KMLOC) ENDIF ENDDO D%OFFSETS_GEMM1(D%NUMP+1) = OFFSET1 D%OFFSETS_GEMM2(D%NUMP+1) = OFFSET2 +D%OFFSETS_GEMM_MATRIX(D%NUMP+1) = OFFSET3 ! 
------------------------------------------------------------------ 9 FORMAT(1X,'ARRAY ',A10,' ALLOCATED ',8I8) diff --git a/src/trans/gpu/internal/tpm_distr.F90 b/src/trans/gpu/internal/tpm_distr.F90 index 088122929..86d7ebf47 100755 --- a/src/trans/gpu/internal/tpm_distr.F90 +++ b/src/trans/gpu/internal/tpm_distr.F90 @@ -13,7 +13,7 @@ MODULE TPM_DISTR ! Module for distributed memory environment. -USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPIB IMPLICIT NONE @@ -161,7 +161,8 @@ MODULE TPM_DISTR REAL(KIND=JPRBT) ,ALLOCATABLE :: RWEIGHT(:) ! Weight per grid-point (if weighted distribution) INTEGER(KIND=JPIM) ,ALLOCATABLE :: NPROCA_GP(:) ! Number of grid-points per a-set -INTEGER(KIND=JPIM), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), ALLOCATABLE :: OFFSETS_GEMM1(:), OFFSETS_GEMM2(:), OFFSETS_GEMM_MATRIX(:) +INTEGER(KIND=JPIM), ALLOCATABLE :: LEGENDRE_MATRIX_STRIDES(:) END TYPE DISTR_TYPE @@ -187,7 +188,7 @@ MODULE TPM_DISTR ! The offsets in the input and output arrays to the gemms. ! (1) are the offsets in the "inputs" of dirtrans ("outputs" invtrans) ! (2) are the offsets in the "outputs" of invtrans ("inputs" dirtrans) -INTEGER(KIND=JPIM), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) +INTEGER(KIND=JPIB), POINTER :: D_OFFSETS_GEMM1(:), D_OFFSETS_GEMM2(:) TYPE(DISTR_TYPE),ALLOCATABLE,TARGET :: DISTR_RESOL(:) TYPE(DISTR_TYPE),POINTER :: D diff --git a/src/trans/gpu/internal/tpm_fields.F90 b/src/trans/gpu/internal/tpm_fields.F90 index 1f917730d..32a1c5a2c 100755 --- a/src/trans/gpu/internal/tpm_fields.F90 +++ b/src/trans/gpu/internal/tpm_fields.F90 @@ -45,8 +45,8 @@ MODULE TPM_FIELDS ! scratch arrays for ltinv and ltdir and associated dimension variables -REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:,:,:) !! JPRL for 1/2 -REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:,:,:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAA(:) !! JPRL for 1/2 +REAL(KIND=JPRBT),ALLOCATABLE :: ZAS(:) !! JPRL for 1/2 ! 
for m=0 in ledir_mod: REAL(KIND=JPRD),ALLOCATABLE :: ZAA0(:,:) diff --git a/src/trans/gpu/internal/tpm_hicfft.F90 b/src/trans/gpu/internal/tpm_hicfft.F90 index c59df0be1..30f2cd65a 100755 --- a/src/trans/gpu/internal/tpm_hicfft.F90 +++ b/src/trans/gpu/internal/tpm_hicfft.F90 @@ -227,8 +227,8 @@ SUBROUTINE EXECUTE_DIR_FFT_FLOAT(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,A SUBROUTINE EXECUTE_DIR_FFT_FLOAT_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_float") USE ISO_C_BINDING - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_REAL(*) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_COMPLEX(*) + REAL(KIND=C_FLOAT), INTENT(IN), DEVICE :: PREEL_REAL(*) + REAL(KIND=C_FLOAT), INTENT(OUT), DEVICE :: PREEL_COMPLEX(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -259,8 +259,8 @@ SUBROUTINE EXECUTE_DIR_FFT_DOUBLE(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS, SUBROUTINE EXECUTE_DIR_FFT_DOUBLE_C(PREEL_REAL,PREEL_COMPLEX,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_dir_fft_double") USE ISO_C_BINDING - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_REAL(*) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_COMPLEX(*) + REAL(KIND=C_DOUBLE), INTENT(IN), DEVICE :: PREEL_REAL(*) + REAL(KIND=C_DOUBLE), INTENT(OUT), DEVICE :: PREEL_COMPLEX(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -292,8 +292,8 @@ SUBROUTINE EXECUTE_INV_FFT_FLOAT(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,A SUBROUTINE EXECUTE_INV_FFT_FLOAT_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_float") USE ISO_C_BINDING - REAL(KIND=C_FLOAT), INTENT(IN) :: PREEL_COMPLEX(*) - REAL(KIND=C_FLOAT), INTENT(OUT) :: PREEL_REAL(*) + REAL(KIND=C_FLOAT), INTENT(IN), DEVICE :: PREEL_COMPLEX(*) + REAL(KIND=C_FLOAT), INTENT(OUT), DEVICE :: 
PREEL_REAL(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT @@ -324,8 +324,8 @@ SUBROUTINE EXECUTE_INV_FFT_DOUBLE(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS, SUBROUTINE EXECUTE_INV_FFT_DOUBLE_C(PREEL_COMPLEX,PREEL_REAL,KFIELD,LOENS,OFFSETS,NFFT,ALLOC) & & BIND(C, NAME="execute_inv_fft_double") USE ISO_C_BINDING - REAL(KIND=C_DOUBLE), INTENT(IN) :: PREEL_COMPLEX(*) - REAL(KIND=C_DOUBLE), INTENT(OUT) :: PREEL_REAL(*) + REAL(KIND=C_DOUBLE), INTENT(IN), DEVICE :: PREEL_COMPLEX(*) + REAL(KIND=C_DOUBLE), INTENT(OUT), DEVICE :: PREEL_REAL(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: KFIELD INTEGER(KIND=C_INT),INTENT(IN) :: LOENS(*), OFFSETS(*) INTEGER(KIND=C_INT),INTENT(IN),VALUE :: NFFT diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 1460239b9..335368d61 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ b/src/trans/gpu/internal/trgtol_mod.F90 @@ -1,6 +1,5 @@ #define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) -! (C) Copyright 1995- ECMWF. -! (C) Copyright 1995- Meteo-France. +! (C) Copyright 1995- ECMWF. +! (C) Copyright 1995- Meteo-France. ! (C) Copyright 2022- NVIDIA. ! ! This software is licensed under the terms of the Apache Licence Version 2.0 @@ -25,7 +24,7 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) USE PARKIND_ECTRANS, ONLY : JPIM, JPRB, JPRBT USE TPM_DISTR, ONLY : D USE BUFFERED_ALLOCATOR_MOD - USE ISO_C_BINDING, ONLY: C_SIZE_T + USE PARKIND_ECTRANS, ONLY : JPIB IMPLICIT NONE @@ -35,13 +34,14 @@ FUNCTION PREPARE_TRGTOL(ALLOCATOR,KF_GP,KF_FS) RESULT(HTRGTOL) REAL(KIND=JPRBT) :: DUMMY - INTEGER(KIND=C_SIZE_T) :: NELEM + INTEGER(KIND=JPIB) :: NELEM - HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, int(KF_GP*D%NGPTOT*SIZEOF(DUMMY),kind=c_size_t)) + HTRGTOL%HCOMBUFS = RESERVE(ALLOCATOR, 1_JPIB*KF_GP*D%NGPTOT*SIZEOF(DUMMY), "HTRGTOL%HCOMBUFS") - NELEM = KF_FS*D%NLENGTF*SIZEOF(DUMMY) !
ZCOMBUFR - NELEM = NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL - HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM) + NELEM = 0 + NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! ZCOMBUFR + NELEM = NELEM + 1_JPIB*KF_FS*D%NLENGTF*SIZEOF(DUMMY) ! PREEL_REAL + HTRGTOL%HCOMBUFR_AND_REEL = RESERVE(ALLOCATOR, NELEM, "HTRGTOL%HCOMBUFR_AND_REEL") END FUNCTION PREPARE_TRGTOL SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,& @@ -105,9 +105,9 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT, jprd + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER, MPL_ABORT USE TPM_GEN ,ONLY : LSYNC_TRANS USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV @@ -117,9 +117,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE MPI_F08 USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : NPROMA - USE ISO_C_BINDING ,ONLY : C_SIZE_T, c_float, c_double, c_int8_t USE BUFFERED_ALLOCATOR_MOD - USE OPENACC_EXT + USE CUDA_COPY_MODULE IMPLICIT NONE @@ -134,31 +133,36 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ! LOCAL VARIABLES + REAL(KIND=JPRB),ALLOCATABLE :: PGP_DEV(:,:,:), PGPUV_DEV(:,:,:,:), PGP3A_DEV(:,:,:,:), PGP3B_DEV(:,:,:,:), PGP2_DEV(:,:,:) ! 
LOCAL INTEGER SCALARS REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIB) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIB) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: ISENDTOT_MPI(NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT_MPI(NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: IFIRSTLAT, IGL, IGLL, ILAST,& - &ILASTLAT, ILEN, JROC, IPOS, ISETA, & + &ILASTLAT, ILEN, JROC, ISETA, & &ISETB, IRECV, & &ISETV, ISEND, JBLK, JFLD, & &JGL, JI, JK, JL, ISETW, IFLD, & &II,IBUFLENR,IRECV_COUNTS, IPROC,IFLDS, & &ISEND_COUNTS,INS,INR,IR, JKL, PBOUND, IERROR, ILOCAL_LAT INTEGER(KIND=JPIM) :: KF, KGL, KI, J3 + INTEGER(KIND=JPIB) :: IPOS INTEGER(KIND=JPIM) :: IOFF, ILAT_STRIP - INTEGER(KIND=JPIM) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2),IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V + INTEGER(KIND=JPIB) :: IRECV_BUFR_TO_OUT(D%NLENGTF,2) + INTEGER(KIND=JPIB) :: IRECV_BUFR_TO_OUT_OFFSET(NPROC), IRECV_BUFR_TO_OUT_V INTEGER(KIND=JPIM) :: ISEND_FIELD_COUNT(NPRTRV),ISEND_FIELD_COUNT_V INTEGER(KIND=JPIM) :: ISEND_WSET_SIZE(NPRTRW),ISEND_WSET_SIZE_V INTEGER(KIND=JPIM) :: ISEND_WSET_OFFSET(NPRTRW+1), ISEND_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIB) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IFLDA(KF_GP) INTEGER(KIND=JPIM) :: IVSET(KF_GP) @@ -171,9 +175,6 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, INTEGER(JPIM), PARAMETER :: PGP_INDICES_END = 5 INTEGER(JPIM) :: PGP_INDICES(PGP_INDICES_END) - TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... 
- INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 - TYPE(MPI_COMM) :: LOCAL_COMM TYPE(MPI_REQUEST) :: IREQUEST(2*NPROC) @@ -258,6 +259,34 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, ISEND_FIELD_COUNT(IVSET(JFLD)) = ISEND_FIELD_COUNT(IVSET(JFLD)) + 1 ENDDO ENDIF + CALL GSTATS(1805,1) + + ! Put data on device for copyin + IF (LSYNC_TRANS) THEN +#ifdef ACCGPU + !$ACC WAIT(1) +#endif + CALL GSTATS(430,0) + CALL MPL_BARRIER(CDSTRING='') + CALL GSTATS(430,1) + ENDIF + IF (PRESENT(PGP)) ALLOCATE(PGP_DEV, MOLD=PGP) + IF (PRESENT(PGPUV)) ALLOCATE(PGPUV_DEV, MOLD=PGPUV) + IF (PRESENT(PGP2)) ALLOCATE(PGP2_DEV, MOLD=PGP2) + IF (PRESENT(PGP3A)) ALLOCATE(PGP3A_DEV, MOLD=PGP3A) + IF (PRESENT(PGP3B)) ALLOCATE(PGP3B_DEV, MOLD=PGP3B) + + !$ACC DATA IF(ALLOCATED(PGP_DEV)) CREATE(PGP_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGPUV_DEV)) CREATE(PGPUV_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP2_DEV)) CREATE(PGP2_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP3A_DEV)) CREATE(PGP3A_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP3B_DEV)) CREATE(PGP3B_DEV) ASYNC(1) + IF (PRESENT(PGP)) CALL COPY(PGP_DEV, PGP, NH2D, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGPUV)) CALL COPY(PGPUV_DEV, PGPUV, NH2D, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP2)) CALL COPY(PGP2_DEV, PGP2, NH2D, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP3A)) CALL COPY(PGP3A_DEV, PGP3A, NH2D, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP3B)) CALL COPY(PGP3B_DEV, PGP3B, NH2D, QUEUE=1_ACC_HANDLE_KIND) + !$ACC WAIT(1) ! 
find number of grid-points on a certain W-set that overlap with myself ISEND_WSET_SIZE(:) = 0 DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) @@ -315,11 +344,8 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IRECVTOT(JROC) = IPOS*KF_FS ENDDO - block CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1,kind=c_size_t), int(KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1)),kind=c_size_t)) - !!CALL ASSIGN_PTR(PREEL_REAL, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL), size1, size2) - end block + & 1_JPIB*KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))+1, 1_JPIB*KF_FS*D%NLENGTF*SIZEOF(PREEL_REAL(1))) #ifdef OMPGPU #endif @@ -327,85 +353,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC DATA COPYIN(IRECV_BUFR_TO_OUT,PGP_INDICES) PRESENT(PREEL_REAL) ASYNC(1) #endif - CALL GSTATS(1805,1) - - ! Put data on device for copyin - IF (LSYNC_TRANS) THEN -#ifdef ACCGPU - !$ACC WAIT(1) -#endif - CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') - CALL GSTATS(430,1) - ENDIF CALL GSTATS(412,0) - ACC_POINTERS_CNT = 0 - IF (PRESENT(PGP)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) - ENDIF - IF (PRESENT(PGPUV)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) - ENDIF - IF (PRESENT(PGP2)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) - ENDIF - IF (PRESENT(PGP3A)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) - ENDIF - IF (PRESENT(PGP3B)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) - ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) - !$ACC WAIT(1) - IF (PRESENT(PGP)) THEN -#ifdef OMPGPU -#endif 
-#ifdef ACCGPU - !$ACC UPDATE DEVICE(PGP) -#endif - ENDIF - IF (PRESENT(PGPUV)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE DEVICE(PGPUV) -#endif - ENDIF - IF (PRESENT(PGP2)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE DEVICE(PGP2) -#endif - ENDIF - IF (PRESENT(PGP3A)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE DEVICE(PGP3A) -#endif - ENDIF - IF (PRESENT(PGP3B)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE DEVICE(PGP3B) -#endif - ENDIF -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC DATA IF(PRESENT(PGP) .AND. KF_GP > 0) PRESENT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) -#endif IF (LSYNC_TRANS) THEN #ifdef ACCGPU !$ACC WAIT(1) @@ -447,7 +395,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IF (ISEND_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFS),& - & 1_C_SIZE_T, int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t)) + & 1_JPIB, ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) ENDIF !....Pack loop......................................................... @@ -499,7 +447,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, JBLK = (ISEND_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD = IFLDA(JFLD) JI = (JFLD-1)*ISEND_WSET_SIZE_V+JL - ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP(JK,IFLD,JBLK) + ZCOMBUFS(ICOMBUFS_OFFSET_V+JI) = PGP_DEV(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -518,20 +466,20 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, JI = ICOMBUFS_OFFSET_V+(JFLD-1)*ISEND_WSET_SIZE_V+JL IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) + PBOUND=UBOUND(PGPUV_DEV,2) ! 
TODO we could certainly reshape PGPXX arrays and we would simplify this - ZCOMBUFS(JI) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + ZCOMBUFS(JI) = PGPUV_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - ZCOMBUFS(JI) = PGP2(JK,IOFF+1,JBLK) + ZCOMBUFS(JI) = PGP2_DEV(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - ZCOMBUFS(JI) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PBOUND=UBOUND(PGP3A_DEV,2) + ZCOMBUFS(JI) = PGP3A_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - ZCOMBUFS(JI)= PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PBOUND=UBOUND(PGP3B_DEV,2) + ZCOMBUFS(JI)= PGP3B_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -556,7 +504,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, CALL GSTATS(411,0) IF (IRECV_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRGTOL%HCOMBUFR_AND_REEL),& - & 1_C_SIZE_T, int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t)) + & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) ENDIF #ifdef OMPGPU #endif @@ -576,11 +524,19 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) #endif + + ISENDTOT_MPI = ISENDTOT + IRECVTOT_MPI = IRECVTOT + IF (ANY(ISENDTOT_MPI /= ISENDTOT)) & + & CALL MPL_ABORT("Overflow in trgtol") + IF (ANY(IRECVTOT_MPI /= IRECVTOT)) & + & CALL MPL_ABORT("Overflow in trgtol") + ! Receive loop......................................................... 
DO INR=1,IRECV_COUNTS IR=IR+1 IPROC=IRECV_TO_PROC(INR) - CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT(IPROC), & + CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT_MPI(IPROC), & & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL ENDDO @@ -589,7 +545,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO INS=1,ISEND_COUNTS IR=IR+1 ISEND=ISEND_TO_PROC(INS) - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL ENDDO @@ -636,7 +592,7 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, IFLD = IFLDA(JFLD) IPOS = IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,1)+ & & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 - PREEL_REAL(IPOS) = PGP(JK,IFLD,JBLK) + PREEL_REAL(IPOS) = PGP_DEV(JK,IFLD,JBLK) ENDDO ENDDO ELSE @@ -656,19 +612,19 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, & (JFLD-1)*IRECV_BUFR_TO_OUT(IRECV_BUFR_TO_OUT_V+JL,2)+1 IF(IFLD < PGP_INDICES(PGP_INDICES_UV+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_UV) - PBOUND=UBOUND(PGPUV,2) - PREEL_REAL(IPOS) = PGPUV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PBOUND=UBOUND(PGPUV_DEV,2) + PREEL_REAL(IPOS) = PGPUV_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP2+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP2) - PREEL_REAL(IPOS) = PGP2(JK,IOFF+1,JBLK) + PREEL_REAL(IPOS) = PGP2_DEV(JK,IOFF+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3A+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3A) - PBOUND=UBOUND(PGP3A,2) - PREEL_REAL(IPOS) = PGP3A(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PBOUND=UBOUND(PGP3A_DEV,2) + PREEL_REAL(IPOS) = 
PGP3A_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ELSEIF(IFLD < PGP_INDICES(PGP_INDICES_GP3B+1)) THEN IOFF=IFLD-PGP_INDICES(PGP_INDICES_GP3B) - PBOUND=UBOUND(PGP3B,2) - PREEL_REAL(IPOS) = PGP3B(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) + PBOUND=UBOUND(PGP3B_DEV,2) + PREEL_REAL(IPOS) = PGP3B_DEV(JK,MOD(IOFF,PBOUND)+1,IOFF/PBOUND+1,JBLK) ENDIF ENDDO ENDDO @@ -738,13 +694,12 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, !$ACC END DATA ! ZCOMBUFR !$ACC END DATA ! IRECV_BUFR_TO_OUT,PGPINDICES !$ACC END DATA !ZCOMBUFS (present) - !$ACC END DATA !PGP3B - !$ACC END DATA !PGP3A - !$ACC END DATA !PGP2 - !$ACC END DATA !PGPUV - !$ACC END DATA !PGP + !$ACC END DATA !PGP3B_DEV + !$ACC END DATA !PGP3A_DEV + !$ACC END DATA !PGP2_DEV + !$ACC END DATA !PGPUV_DEV + !$ACC END DATA !PGP_DEV #endif - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) IF (LHOOK) CALL DR_HOOK('TRGTOL',1,ZHOOK_HANDLE) END SUBROUTINE TRGTOL diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 6143efe3b..6d23af351 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -15,16 +15,15 @@ MODULE TRLTOG_MOD IMPLICIT NONE PRIVATE - PUBLIC :: TRLTOG_CUDAAWARE, TRLTOG_HANDLE, PREPARE_TRLTOG + PUBLIC :: TRLTOG, TRLTOG_HANDLE, PREPARE_TRLTOG TYPE TRLTOG_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HCOMBUFR_AND_COMBUFS END TYPE CONTAINS FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -34,15 +33,16 @@ FUNCTION PREPARE_TRLTOG(ALLOCATOR,KF_FS,KF_GP) RESULT(HTRLTOG) REAL(KIND=JPRBT) :: DUMMY - INTEGER(KIND=C_SIZE_T) :: NELEM + INTEGER(KIND=JPIB) :: NELEM - NELEM = ALIGN(KF_GP*D%NGPTOT*SIZEOF(DUMMY),128) ! 
ZCOMBUFR - NELEM = ALIGN(NELEM + KF_FS*D%NLENGTF*SIZEOF(DUMMY),128) !ZCOMBUFS upper obund + NELEM = 0 + NELEM = NELEM + ALIGN(1_JPIB*KF_GP*D%NGPTOT*SIZEOF(DUMMY),128) ! ZCOMBUFR + NELEM = NELEM + ALIGN(1_JPIB*KF_FS*D%NLENGTF*SIZEOF(DUMMY),128) !ZCOMBUFS upper bound - HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM) + HTRLTOG%HCOMBUFR_AND_COMBUFS = RESERVE(ALLOCATOR, NELEM, "HTRLTOG%HCOMBUFR_AND_COMBUFS") END FUNCTION PREPARE_TRLTOG - SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& + SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G,KPTRGP,& & KVSETUV,KVSETSC,KVSETSC3A,KVSETSC3B,KVSETSC2,& & PGP,PGPUV,PGP3A,PGP3B,PGP2) @@ -103,9 +103,9 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ ! 09-01-02 G.Mozdzynski: use non-blocking recv and send ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRB , JPRBT, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER + USE MPL_MODULE ,ONLY : MPL_WAIT, MPL_BARRIER, MPL_ABORT USE TPM_GEN ,ONLY : LSYNC_TRANS USE EQ_REGIONS_MOD ,ONLY : MY_REGION_EW, MY_REGION_NS USE TPM_DISTR ,ONLY : D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV @@ -117,8 +117,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ USE TPM_STATS ,ONLY : GSTATS => GSTATS_NVTX USE TPM_TRANS ,ONLY : LDIVGP, LSCDERS, LUVDER, LVORGP, NPROMA - USE ISO_C_BINDING ,ONLY : C_SIZE_T - USE OPENACC_EXT + USE CUDA_COPY_MODULE IMPLICIT NONE @@ -147,19 +146,23 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !
LOCAL VARIABLES + REAL(KIND=JPRB),ALLOCATABLE :: PGP_DEV(:,:,:), PGPUV_DEV(:,:,:,:), PGP3A_DEV(:,:,:,:), PGP3B_DEV(:,:,:,:), PGP2_DEV(:,:,:) REAL(KIND=JPRBT), POINTER :: ZCOMBUFS(:),ZCOMBUFR(:) - INTEGER(KIND=JPIM) :: ISENDTOT (NPROC) - INTEGER(KIND=JPIM) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIB) :: ISENDTOT (NPROC) + INTEGER(KIND=JPIB) :: IRECVTOT (NPROC) + INTEGER(KIND=JPIM) :: ISENDTOT_MPI(NPROC) + INTEGER(KIND=JPIM) :: IRECVTOT_MPI(NPROC) INTEGER(KIND=JPIM) :: IREQ (NPROC*2) INTEGER(KIND=JPIM) :: IRECV_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: ISEND_TO_PROC(NPROC) INTEGER(KIND=JPIM) :: JFLD, J, JI, J1, J2, JGL, JK, JL, IFLDS, JROC, INR, INS INTEGER(KIND=JPIM) :: IFIRSTLAT, ILASTLAT, IFLD, IGL, IGLL,& - &IPOS, ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & + &ISETA, ISETB, ISETV, ISEND, IRECV, ISETW, IPROC, & &IR, ILOCAL_LAT, ISEND_COUNTS, IRECV_COUNTS, IERROR, II, ILEN, IBUFLENS, IBUFLENR, & &JBLK, ILAT_STRIP + INTEGER(KIND=JPIB) :: IPOS ! Contains FIELD, PARS, LEVS INTEGER(KIND=JPIM) :: IGP_OFFSETS(KF_GP,3) @@ -167,12 +170,13 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ INTEGER(KIND=JPIM) :: IUVPAR,IGP2PAR,IGP3ALEV,IGP3APAR,IGP3BLEV,IGP3BPAR,IPAROFF,IOFF INTEGER(KIND=JPIM) :: IFLDA(KF_GP) - INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR(D%NLENGTF,2),IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V + INTEGER(KIND=JPIB) :: IIN_TO_SEND_BUFR(D%NLENGTF,2) + INTEGER(KIND=JPIM) :: IIN_TO_SEND_BUFR_OFFSET(NPROC), IIN_TO_SEND_BUFR_V INTEGER(KIND=JPIM) :: IRECV_FIELD_COUNT(NPRTRV),IRECV_FIELD_COUNT_V INTEGER(KIND=JPIM) :: IRECV_WSET_SIZE(NPRTRW),IRECV_WSET_SIZE_V INTEGER(KIND=JPIM) :: IRECV_WSET_OFFSET(NPRTRW+1), IRECV_WSET_OFFSET_V - INTEGER(KIND=JPIM), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) - INTEGER(KIND=JPIM) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V + INTEGER(KIND=JPIB), ALLOCATABLE :: ICOMBUFS_OFFSET(:),ICOMBUFR_OFFSET(:) + INTEGER(KIND=JPIB) :: ICOMBUFS_OFFSET_V, ICOMBUFR_OFFSET_V INTEGER(KIND=JPIM) :: IVSETUV(KF_UV_G) 
INTEGER(KIND=JPIM) :: IVSETSC(KF_SCALARS_G) @@ -182,9 +186,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ REAL(KIND=JPHOOK) :: ZHOOK_HANDLE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE_BAR - TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... - INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT = 0 - TYPE(MPI_COMM) :: LOCAL_COMM TYPE(MPI_REQUEST) :: IREQUEST(NPROC*2) @@ -421,6 +422,18 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IRECV_FIELD_COUNT(IVSET(JFLD)) = IRECV_FIELD_COUNT(IVSET(JFLD)) + 1 ENDDO ENDIF + + IF (PRESENT(PGP)) ALLOCATE(PGP_DEV, MOLD=PGP) + IF (PRESENT(PGPUV)) ALLOCATE(PGPUV_DEV, MOLD=PGPUV) + IF (PRESENT(PGP2)) ALLOCATE(PGP2_DEV, MOLD=PGP2) + IF (PRESENT(PGP3A)) ALLOCATE(PGP3A_DEV, MOLD=PGP3A) + IF (PRESENT(PGP3B)) ALLOCATE(PGP3B_DEV, MOLD=PGP3B) + + !$ACC DATA IF(ALLOCATED(PGP_DEV)) CREATE(PGP_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGPUV_DEV)) CREATE(PGPUV_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP2_DEV)) CREATE(PGP2_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP3A_DEV)) CREATE(PGP3A_DEV) ASYNC(1) + !$ACC DATA IF(ALLOCATED(PGP3B_DEV)) CREATE(PGP3B_DEV) ASYNC(1) ! find number of grid-points on a certain W-set that overlap with myself IRECV_WSET_SIZE(:) = 0 DO ILOCAL_LAT=D%NFRSTLAT(MY_REGION_NS),D%NLSTLAT(MY_REGION_NS) @@ -436,7 +449,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ DO JROC=1,NPROC CALL PE2SET(JROC,ISETA,ISETB,ISETW,ISETV) ! total recv size is # points per field * # fields - IRECVTOT(JROC) = IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) + IRECVTOT(JROC) = 1_JPIB*IRECV_WSET_SIZE(ISETW)*IRECV_FIELD_COUNT(ISETV) ENDDO ! Prepare sender arrays @@ -468,7 +481,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IPOS = IPOS+1 ! 
offset to first layer of this gridpoint IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,1) = & - & KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) + & 1_JPIB*KF_FS*D%NSTAGTF(IGLL)+(D%NSTA(IGL,ISETB)-1)+(JL-1) ! distance between two layers of this gridpoint IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_OFFSET(JROC)+IPOS,2) = & & D%NSTAGTF(IGLL+1)-D%NSTAGTF(IGLL) @@ -484,36 +497,9 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !$ACC DATA COPYIN(IIN_TO_SEND_BUFR,IGP_OFFSETS) ASYNC(1) #endif - ACC_POINTERS_CNT = 0 - IF (PRESENT(PGP)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP) - ENDIF - IF (PRESENT(PGPUV)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGPUV) - ENDIF - IF (PRESENT(PGP2)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP2) - ENDIF - IF (PRESENT(PGP3A)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3A) - ENDIF - IF (PRESENT(PGP3B)) THEN - ACC_POINTERS_CNT = ACC_POINTERS_CNT + 1 - ACC_POINTERS(ACC_POINTERS_CNT) = EXT_ACC_PASS(PGP3B) - ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_CREATE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) #ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC DATA IF(PRESENT(PGP)) PRESENT(PGP) ASYNC(1) - !$ACC DATA IF(PRESENT(PGPUV)) PRESENT(PGPUV) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP2)) PRESENT(PGP2) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3A)) PRESENT(PGP3A) ASYNC(1) - !$ACC DATA IF(PRESENT(PGP3B)) PRESENT(PGP3B) ASYNC(1) ! 
Present until self contribution and packing are done !$ACC DATA PRESENT(PREEL_REAL) @@ -551,7 +537,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(MYSETW) IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(MYSETW) IIN_TO_SEND_BUFR_V = IIN_TO_SEND_BUFR_OFFSET(MYPROC) - IF (PRESENT(PGP)) THEN + IF (ALLOCATED(PGP_DEV)) THEN #ifdef OMPGPU #endif #ifdef ACCGPU @@ -566,7 +552,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IFLD = IFLDA(JFLD) IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 - PGP(JK,IFLD,JBLK) = PREEL_REAL(IPOS) + PGP_DEV(JK,IFLD,JBLK) = PREEL_REAL(IPOS) ENDDO ENDDO ELSE @@ -585,13 +571,13 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IPOS = IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,1)+ & & (JFLD-1)*IIN_TO_SEND_BUFR(IIN_TO_SEND_BUFR_V+JL,2)+1 IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN - PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + PGPUV_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN - PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) + PGP2_DEV(JK,IGP_OFFSETS(IFLD,2),JBLK)=PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN - PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + PGP3A_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN - PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) + PGP3B_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = PREEL_REAL(IPOS) ENDIF ENDDO ENDDO @@ -637,12 +623,12 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IF (IRECV_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFR, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & 1_C_SIZE_T, 
int(ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1)),kind=c_size_t)) + & 1_JPIB, ICOMBUFR_OFFSET(IRECV_COUNTS+1)*SIZEOF(ZCOMBUFR(1))) ENDIF IF (ISEND_COUNTS > 0) THEN CALL ASSIGN_PTR(ZCOMBUFS, GET_ALLOCATION(ALLOCATOR, HTRLTOG%HCOMBUFR_AND_COMBUFS),& - & int(ALIGN(KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1,kind=c_size_t), & - & int(ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1)),kind=c_size_t)) + & ALIGN(1_JPIB*KF_GP*D%NGPTOT*SIZEOF(ZCOMBUFR(1)),128)+1, & + & ICOMBUFS_OFFSET(ISEND_COUNTS+1)*SIZEOF(ZCOMBUFS(1))) ENDIF #ifdef OMPGPU @@ -702,11 +688,19 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ !! this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(ZCOMBUFS) #endif + + ISENDTOT_MPI = ISENDTOT + IRECVTOT_MPI = IRECVTOT + IF (ANY(ISENDTOT_MPI /= ISENDTOT)) & + & CALL MPL_ABORT("Overflow in trltog") + IF (ANY(IRECVTOT_MPI /= IRECVTOT)) & + & CALL MPL_ABORT("Overflow in trltog") + DO INR=1,IRECV_COUNTS IR=IR+1 IRECV=IRECV_TO_PROC(INR) CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & - & IRECVTOT(IRECV), & + & IRECVTOT_MPI(IRECV), & & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & & MTAGLG, LOCAL_COMM, IREQUEST(IR), & & IERROR ) @@ -717,14 +711,14 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ DO INS=1,ISEND_COUNTS IR=IR+1 ISEND=ISEND_TO_PROC(INS) - CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT(ISEND), & + CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL ENDDO IF(IR > 0) THEN CALL MPL_WAIT(KREQUEST=IREQ(1:IR), & - & CDSTRING='TRLTOG_CUDAAWARE: WAIT FOR SENDS AND RECEIVES') + & CDSTRING='TRLTOG: WAIT FOR SENDS AND RECEIVES') ENDIF #ifdef USE_CUDA_AWARE_MPI_FT @@ -782,7 +776,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ 
IRECV_WSET_OFFSET_V = IRECV_WSET_OFFSET(ISETW) IRECV_WSET_SIZE_V = IRECV_WSET_SIZE(ISETW) - IF (PRESENT(PGP)) THEN + IF (ALLOCATED(PGP_DEV)) THEN #ifdef OMPGPU #endif #ifdef ACCGPU @@ -796,7 +790,7 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ JBLK = (IRECV_WSET_OFFSET_V+JL-1)/NPROMA+1 IFLD=IFLDA(JFLD) JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL - PGP(JK,IFLD,JBLK) = ZCOMBUFR(JI) + PGP_DEV(JK,IFLD,JBLK) = ZCOMBUFR(JI) ENDDO ENDDO ELSE @@ -814,13 +808,13 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ IFLD=IFLDA(JFLD) JI = ICOMBUFR_OFFSET_V+(JFLD-1)*IRECV_WSET_SIZE_V+JL IF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_UV) THEN - PGPUV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + PGPUV_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP2) THEN - PGP2(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + PGP2_DEV(JK,IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3A) THEN - PGP3A(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + PGP3A_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) ELSEIF(IGP_OFFSETS(IFLD,1) == IGP_OFFSETS_GP3B) THEN - PGP3B(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) + PGP3B_DEV(JK,IGP_OFFSETS(IFLD,3),IGP_OFFSETS(IFLD,2),JBLK) = ZCOMBUFR(JI) ENDIF ENDDO ENDDO @@ -849,48 +843,17 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ #ifdef OMPGPU #endif #ifdef ACCGPU + IF (PRESENT(PGP)) CALL COPY(PGP, PGP_DEV, ND2H, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGPUV)) CALL COPY(PGPUV, PGPUV_DEV, ND2H, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP2)) CALL COPY(PGP2, PGP2_DEV, ND2H, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP3A)) CALL COPY(PGP3A, PGP3A_DEV, ND2H, QUEUE=1_ACC_HANDLE_KIND) + IF (PRESENT(PGP3B)) CALL COPY(PGP3B, PGP3B_DEV, ND2H, QUEUE=1_ACC_HANDLE_KIND) !$ACC END DATA ! 
PGP3B !$ACC END DATA ! PGP3A !$ACC END DATA ! PGP2 !$ACC END DATA ! PGPUV !$ACC END DATA ! PGP #endif - IF (PRESENT(PGP)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PGP) -#endif - ENDIF - IF (PRESENT(PGPUV)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PGPUV) -#endif - ENDIF - IF (PRESENT(PGP2)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PGP2) -#endif - ENDIF - IF (PRESENT(PGP3A)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PGP3A) -#endif - ENDIF - IF (PRESENT(PGP3B)) THEN -#ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PGP3B) -#endif - ENDIF - IF (ACC_POINTERS_CNT > 0) CALL EXT_ACC_DELETE(ACC_POINTERS(1:ACC_POINTERS_CNT),STREAM=1_ACC_HANDLE_KIND) IF (LSYNC_TRANS) THEN #ifdef ACCGPU !$ACC WAIT(1) @@ -911,6 +874,6 @@ SUBROUTINE TRLTOG_CUDAAWARE(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_ CALL GSTATS(1606,1) IF (LHOOK) CALL DR_HOOK('TRLTOG',1,ZHOOK_HANDLE) - END SUBROUTINE TRLTOG_CUDAAWARE + END SUBROUTINE TRLTOG END MODULE TRLTOG_MOD diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index f13e3353b..205cafd89 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -14,7 +14,7 @@ MODULE TRLTOM_MOD IMPLICIT NONE PRIVATE - PUBLIC :: TRLTOM_CUDAAWARE, PREPARE_TRLTOM, TRLTOM_HANDLE + PUBLIC :: TRLTOM, PREPARE_TRLTOM, TRLTOM_HANDLE TYPE TRLTOM_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF @@ -23,7 +23,6 @@ MODULE TRLTOM_MOD FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -33,10 +32,10 @@ FUNCTION PREPARE_TRLTOM(ALLOCATOR, KF_FS) RESULT(HTRLTOM) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, int(D%NLENGT1B*2*KF_FS*SIZEOF(DUMMY),kind=c_size_t)) + HTRLTOM%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*SIZEOF(DUMMY), "HTRLTOM%HPFBUF") END FUNCTION - 
SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) + SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !**** *TRLTOM * - transposition in Fourierspace ! Purpose. @@ -87,14 +86,13 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK, MPL_ABORT USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS USE MPI_F08 USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE ISO_C_BINDING, ONLY : C_SIZE_T IMPLICIT NONE @@ -102,8 +100,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) REAL(KIND=JPRBT) ,INTENT(OUT), POINTER :: PFBUF(:) REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF_IN(:) - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),ILENR(NPRTRW) + INTEGER(KIND=JPIB) :: IOFFS(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIB) :: ILEN, ISTA REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR @@ -118,10 +118,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #endif LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM - IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',0,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HPFBUF),& - & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_FS*SIZEOF(PFBUF(1)),kind=c_size_t)) + & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_FS*SIZEOF(PFBUF(1))) #ifdef OMPGPU 
#endif @@ -131,10 +131,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IF(NPROC > 1) THEN DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS + ILENS(J) = 2_JPIB*D%NLTSGTB(J)*KF_FS + IOFFS(J) = 2_JPIB*D%NSTAGT0B(J)*KF_FS + ILENR(J) = 2_JPIB*D%NLTSFTB(J)*KF_FS + IOFFR(J) = 2_JPIB*D%NSTAGT1B(J)*KF_FS ENDDO CALL GSTATS(806,0) @@ -151,15 +151,13 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) FROM_RECV = IOFFR(IRANK) + 1 TO_RECV = FROM_RECV + ILENR(IRANK) - 1 #ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC KERNELS ASYNC(1) -#endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) -#ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC END KERNELS + !$ACC PARALLEL LOOP GANG VECTOR ASYNC(1) + DO J = 1, TO_SEND - FROM_SEND + 1 + PFBUF(FROM_RECV+J-1) = PFBUF_IN(FROM_SEND+J-1) + ENDDO #endif ILENS(IRANK) = 0 ILENR(IRANK) = 0 @@ -167,7 +165,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IF (LSYNC_TRANS) THEN CALL GSTATS(430,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(430,1) ENDIF CALL GSTATS(411,0) @@ -181,8 +179,10 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !! 
this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(PFBUF_IN,PFBUF) #endif - CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& - & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & + IF (ANY(INT(IOFFR,KIND=JPIM) /= INT(IOFFR,KIND=JPIB))) CALL MPL_ABORT("Overflow in trltom") + IF (ANY(INT(IOFFS,KIND=JPIM) /= INT(IOFFS,KIND=JPIB))) CALL MPL_ABORT("Overflow in trltom") + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,INT(IOFFS,KIND=JPIM),TRLTOM_DTYPE,& + & PFBUF,ILENR,INT(IOFFR,KIND=JPIM), TRLTOM_DTYPE, & & LOCAL_COMM,IERROR) #ifdef USE_CUDA_AWARE_MPI_FT #ifdef OMPGPU @@ -196,7 +196,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #endif IF (LSYNC_TRANS) THEN CALL GSTATS(431,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(431,1) ENDIF CALL GSTATS(411,1) @@ -206,8 +206,8 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #endif CALL GSTATS(806,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_FS - ISTA = D%NSTAGT1B(MYSETW)*2*KF_FS+1 + ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_FS + ISTA = 2_JPIB*D%NSTAGT1B(MYSETW)*KF_FS+1 CALL GSTATS(1607,0) #ifdef OMPGPU #endif @@ -226,7 +226,7 @@ SUBROUTINE TRLTOM_CUDAAWARE(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !$ACC END DATA #endif - IF (LHOOK) CALL DR_HOOK('TRLTOM_CUDAAWARE',1,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRLTOM',1,ZHOOK_HANDLE) ! 
------------------------------------------------------------------ - END SUBROUTINE TRLTOM_CUDAAWARE + END SUBROUTINE TRLTOM END MODULE TRLTOM_MOD diff --git a/src/trans/gpu/internal/trltom_pack_unpack.F90 b/src/trans/gpu/internal/trltom_pack_unpack.F90 index b14a0188b..a171df9fc 100755 --- a/src/trans/gpu/internal/trltom_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltom_pack_unpack.F90 @@ -26,9 +26,8 @@ MODULE TRLTOM_PACK_UNPACK END TYPE CONTAINS FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -38,7 +37,7 @@ FUNCTION PREPARE_TRLTOM_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) REAL(KIND=JPRBT) :: DUMMY - HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(D%NLENGT0B*KF_FS*2*SIZEOF(DUMMY),kind=c_size_t)) + HTRLTOM_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*SIZEOF(DUMMY), "HTRLTOM_PACK%HFOUBUF_IN") END FUNCTION PREPARE_TRLTOM_PACK SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) @@ -65,11 +64,10 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ! ------------------------------------------------------------------ USE BUFFERED_ALLOCATOR_MOD - USE PARKIND_ECTRANS, ONLY : JPIM,JPRBT + USE PARKIND_ECTRANS, ONLY : JPIM,JPRBT,JPIB USE TPM_DISTR, ONLY : D,MYSETW,D_NSTAGTF,D_NPNTGTB0,D_NPTRLS,D_NDGL_FS USE TPM_GEOMETRY, ONLY : G_NMEN,G_NLOEN USE TPM_DIM, ONLY: R_NSMAX - USE ISO_C_BINDING ! 
IMPLICIT NONE @@ -80,12 +78,13 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRLTOM_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK - INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL + INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL + INTEGER(KIND=JPIB) :: IOFF_LAT,ISTA REAL(KIND=JPRBT) :: SCAL CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HFOUBUF_IN),& - & 1_C_SIZE_T, int(D%NLENGT0B*KF_FS*2*SIZEOF(FOUBUF_IN(1)),kind=c_size_t)) + & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*SIZEOF(FOUBUF_IN(1))) #ifdef OMPGPU #endif @@ -111,7 +110,7 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) IOFF_LAT = KF_FS*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) SCAL = 1._JPRBT/REAL(G_NLOEN(IGLG),JPRBT) - ISTA = D_NPNTGTB0(JM,KGL)*KF_FS*2 + ISTA = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_FS FOUBUF_IN(ISTA+2*JF-1) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+1) FOUBUF_IN(ISTA+2*JF ) = SCAL * PREEL_COMPLEX(IOFF_LAT+2*JM+2) @@ -129,9 +128,8 @@ SUBROUTINE TRLTOM_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) END SUBROUTINE TRLTOM_PACK FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE LEDIR_MOD, ONLY: LEDIR_STRIDES - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -139,9 +137,10 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS TYPE(TRLTOM_UNPACK_HANDLE) :: HTRLTOM_UNPACK - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=C_SIZE_T) :: ISIZE + INTEGER(KIND=JPIB) :: ISIZE REAL(KIND=JPRBT) :: ZPRBT_DUMMY REAL(KIND=JPRD) :: ZPRD_DUMMY @@ -155,11 +154,11 @@ FUNCTION PREPARE_TRLTOM_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) ISIZE = ISIZE + 
ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) ISIZE = ISIZE + ALIGN(IIN0_SIZE*SIZEOF(ZPRD_DUMMY),128) - HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE) + HTRLTOM_UNPACK%HINPS_AND_ZINPA = RESERVE(ALLOCATOR, ISIZE, "HTRLTOM_UNPACK%HINPS_AND_ZINPA") END FUNCTION PREPARE_TRLTOM_UNPACK SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - USE PARKIND_ECTRANS, ONLY : JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY : JPIM, JPRBT, JPRD, JPIB USE TPM_DIM, ONLY : R_NDGNH, R_NDGL USE TPM_GEOMETRY, ONLY : G_NDGLU USE TPM_FIELDS, ONLY : F_RW, F_RACTHE @@ -178,13 +177,14 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) - INTEGER(KIND=JPIM) :: IIN_STRIDES0, IIN_SIZE + INTEGER(KIND=JPIM) :: IIN_STRIDES0 + INTEGER(KIND=JPIB) :: IIN_SIZE INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=C_SIZE_T) :: IALLOC_POS, IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_POS, IALLOC_SZ - INTEGER(KIND=8) :: JF - INTEGER(KIND=JPIM) :: KM, ISL, IGLS, OFFSET1, OFFSET2, JGL, KMLOC + INTEGER(KIND=JPIB) :: JF, OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: KM, ISL, IGLS, JGL, KMLOC REAL(KIND=JPRBT) :: PAIA, PAIS @@ -234,8 +234,8 @@ SUBROUTINE TRLTOM_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINP IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_FS - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_FS + OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_FS + OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_FS PAIA = FOUBUF(OFFSET1+JF)-FOUBUF(OFFSET2+JF) PAIS = FOUBUF(OFFSET1+JF)+FOUBUF(OFFSET2+JF) IF (JF <= 4*KF_UV) THEN diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index a82145f39..5b511832b 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -14,7 +14,7 @@ MODULE TRMTOL_MOD IMPLICIT NONE PRIVATE - PUBLIC :: TRMTOL_CUDAAWARE, PREPARE_TRMTOL, 
TRMTOL_HANDLE + PUBLIC :: TRMTOL, PREPARE_TRMTOL, TRMTOL_HANDLE TYPE TRMTOL_HANDLE TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF @@ -33,10 +33,10 @@ FUNCTION PREPARE_TRMTOL(ALLOCATOR, KF_LEG) RESULT(HTRMTOL) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, int(D%NLENGT0B*2*KF_LEG*SIZEOF(DUMMY),kind=c_size_t)) + HTRMTOL%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_LEG*SIZEOF(DUMMY), "HTRMTOL%HPFBUF") END FUNCTION - SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) + SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) !**** *trmtol * - transposition in Fourier space ! Purpose. @@ -87,14 +87,13 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT + USE PARKIND_ECTRANS ,ONLY : JPIM ,JPRBT, JPIB USE YOMHOOK ,ONLY : LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE MPL_MODULE ,ONLY : MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK, MPL_ABORT USE TPM_DISTR ,ONLY : D, NPRTRW, NPROC, MYPROC, MYSETW USE TPM_GEN ,ONLY : LSYNC_TRANS USE MPI_F08 USE TPM_STATS, ONLY : GSTATS => GSTATS_NVTX - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -102,8 +101,10 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF(:) REAL(KIND=JPRBT), INTENT(IN) :: PFBUF_IN(:) - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, ILEN, ISTA, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIM) :: ILENS(NPRTRW),ILENR(NPRTRW) + INTEGER(KIND=JPIB) :: IOFFS(NPRTRW),IOFFR(NPRTRW) + INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK + INTEGER(KIND=JPIB) :: ILEN, ISTA REAL(KIND=JPHOOK) :: ZHOOK_HANDLE INTEGER(KIND=JPIM) :: IERROR @@ -118,17 +119,17 @@ 
SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM - IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',0,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRMTOL',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(PFBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HPFBUF),& - & 1_C_SIZE_T, int(D%NLENGT0B*2*KF_LEG*SIZEOF(PFBUF(1)),kind=c_size_t)) + & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_LEG*SIZEOF(PFBUF(1))) IF(NPROC > 1) THEN DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*2*KF_LEG - IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG - ILENR(J) = D%NLTSGTB(J)*2*KF_LEG - IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG + ILENS(J) = 2_JPIB*D%NLTSFTB(J)*KF_LEG + IOFFS(J) = 2_JPIB*D%NSTAGT1B(J)*KF_LEG + ILENR(J) = 2_JPIB*D%NLTSGTB(J)*KF_LEG + IOFFR(J) = 2_JPIB*D%NSTAGT0B(J)*KF_LEG ENDDO CALL GSTATS(807,0) @@ -145,15 +146,13 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) FROM_RECV = IOFFR(IRANK) + 1 TO_RECV = FROM_RECV + ILENR(IRANK) - 1 #ifdef OMPGPU -#endif -#ifdef ACCGPU - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) -#endif PFBUF(FROM_RECV:TO_RECV) = PFBUF_IN(FROM_SEND:TO_SEND) -#ifdef OMPGPU #endif #ifdef ACCGPU - !$ACC END KERNELS + !$ACC PARALLEL LOOP GANG VECTOR ASYNC(1) + DO J = 1, TO_SEND - FROM_SEND + 1 + PFBUF(FROM_RECV+J-1) = PFBUF_IN(FROM_SEND+J-1) + ENDDO #endif ILENS(IRANK) = 0 ILENR(IRANK) = 0 @@ -161,7 +160,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IF (LSYNC_TRANS) THEN CALL GSTATS(440,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(440,1) ENDIF CALL GSTATS(421,0) @@ -175,8 +174,10 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) !! 
this is safe-but-slow fallback for running without GPU-aware MPI !$ACC UPDATE HOST(PFBUF_IN,PFBUF) #endif - CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& - & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& + IF (ANY(INT(IOFFR,KIND=JPIM) /= INT(IOFFR,KIND=JPIB))) CALL MPL_ABORT("Overflow in trmtol") + IF (ANY(INT(IOFFS,KIND=JPIM) /= INT(IOFFS,KIND=JPIB))) CALL MPL_ABORT("Overflow in trmtol") + CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,INT(IOFFS,KIND=JPIM),TRMTOL_DTYPE,& + & PFBUF,ILENR,INT(IOFFR,KIND=JPIM),TRMTOL_DTYPE,& & LOCAL_COMM,IERROR) #ifdef USE_CUDA_AWARE_MPI_FT #ifdef OMPGPU @@ -190,7 +191,7 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif IF (LSYNC_TRANS) THEN CALL GSTATS(441,0) - CALL MPL_BARRIER(CDSTRING='') + CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') CALL GSTATS(441,1) ENDIF CALL GSTATS(421,1) @@ -200,8 +201,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif CALL GSTATS(807,1) ELSE - ILEN = D%NLTSGTB(MYSETW)*2*KF_LEG - ISTA = D%NSTAGT0B(MYSETW)*2*KF_LEG+1 + ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_LEG + ISTA = 2_JPIB*D%NSTAGT0B(MYSETW)*KF_LEG+1 CALL GSTATS(1608,0) #ifdef OMPGPU #endif @@ -214,8 +215,8 @@ SUBROUTINE TRMTOL_CUDAAWARE(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(1608,1) ENDIF - IF (LHOOK) CALL DR_HOOK('TRMTOL_CUDAAWARE',1,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) ! 
------------------------------------------------------------------ - END SUBROUTINE TRMTOL_CUDAAWARE + END SUBROUTINE TRMTOL END MODULE TRMTOL_MOD diff --git a/src/trans/gpu/internal/trmtol_pack_unpack.F90 b/src/trans/gpu/internal/trmtol_pack_unpack.F90 index e7076693e..b02bea90e 100755 --- a/src/trans/gpu/internal/trmtol_pack_unpack.F90 +++ b/src/trans/gpu/internal/trmtol_pack_unpack.F90 @@ -26,9 +26,8 @@ MODULE TRMTOL_PACK_UNPACK CONTAINS FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING USE LEINV_MOD USE BUFFERED_ALLOCATOR_MOD @@ -39,12 +38,12 @@ FUNCTION PREPARE_TRMTOL_PACK(ALLOCATOR,KF_LEG) RESULT(HTRMTOL_PACK) TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK - INTEGER(KIND=C_SIZE_T) :: IALLOC_SZ + INTEGER(KIND=JPIB) :: IALLOC_SZ REAL(KIND=JPRBT) :: ZPRBT_DUMMY - IALLOC_SZ = D%NLENGT1B*2*KF_LEG*SIZEOF(ZPRBT_DUMMY) - HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, int(IALLOC_SZ,kind=c_size_t)) + IALLOC_SZ = 2_JPIB*D%NLENGT1B*KF_LEG*SIZEOF(ZPRBT_DUMMY) + HTRMTOL_PACK%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOL_PACK%HFOUBUF_IN") END FUNCTION SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,KF_LEG) @@ -85,7 +84,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ! F. Vana 05-Mar-2015 Support for single precision ! ------------------------------------------------------------------ - USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD + USE PARKIND_ECTRANS ,ONLY : JPIM,JPRB,JPRBT,JPRD, JPIB USE YOMHOOK, ONLY : LHOOK,DR_HOOK, JPHOOK USE TPM_DIM, ONLY : R_NDGNH,R_NDGL USE TPM_GEOMETRY,ONLY : G_NDGLU @@ -107,8 +106,10 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I ! 
LOCAL REAL(KIND=JPRBT) :: ZAOA, ZSOA - INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS, OFFSET1, OFFSET2 - INTEGER(KIND=JPIM) :: IOUT_STRIDES0, IOUT_SIZE + INTEGER(KIND=JPIM) :: KMLOC, KM, ISL, JGL, JK, IGLS + INTEGER(KIND=JPIB) :: OFFSET1, OFFSET2 + INTEGER(KIND=JPIM) :: IOUT_STRIDES0 + INTEGER(KIND=JPIB) :: IOUT_SIZE INTEGER(KIND=JPIM) :: IOUT0_STRIDES0, IOUT0_SIZE REAL(KIND=JPHOOK) :: ZHOOK_HANDLE @@ -116,7 +117,7 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (LHOOK) CALL DR_HOOK('TRMTOL_PACK',0,ZHOOK_HANDLE) CALL ASSIGN_PTR(FOUBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& - & 1_C_SIZE_T, int(D%NLENGT1B*2*KF_LEG*SIZEOF(FOUBUF_IN(1)),kind=c_size_t)) + & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*SIZEOF(FOUBUF_IN(1))) CALL LEINV_STRIDES(KF_LEG,IOUT_STRIDES0=IOUT_STRIDES0,IOUT_SIZE=IOUT_SIZE,& IOUT0_STRIDES0=IOUT0_STRIDES0,IOUT0_SIZE=IOUT0_SIZE) @@ -142,8 +143,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I IF (JGL >= ISL) THEN !(DO JGL=ISL,R_NDGNH) IGLS = R_NDGL+1-JGL - OFFSET1 = D_NPNTGTB1(KMLOC,JGL )*2*KF_LEG - OFFSET2 = D_NPNTGTB1(KMLOC,IGLS)*2*KF_LEG + OFFSET1 = 2_JPIB*D_NPNTGTB1(KMLOC,JGL )*KF_LEG + OFFSET2 = 2_JPIB*D_NPNTGTB1(KMLOC,IGLS)*KF_LEG IF(KM /= 0) THEN ZSOA = ZOUTS(JK+(JGL-ISL)*IOUT_STRIDES0+D_OFFSETS_GEMM1(KMLOC)*IOUT_STRIDES0) @@ -177,9 +178,8 @@ SUBROUTINE TRMTOL_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_I END SUBROUTINE TRMTOL_PACK FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT + USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING, ONLY: C_SIZE_T IMPLICIT NONE @@ -190,7 +190,7 @@ FUNCTION PREPARE_TRMTOL_UNPACK(ALLOCATOR,KF_FS) RESULT(HTRMTOL_UNPACK) REAL(KIND=JPRBT) :: DUMMY - HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, int(D%NLENGTF*KF_FS*SIZEOF(DUMMY),kind=c_size_t)) + HTRMTOL_UNPACK%HREEL = RESERVE(ALLOCATOR, 
1_JPIB*D%NLENGTF*KF_FS*SIZEOF(DUMMY), "HTRMTOL_UNPACK%HREEL") END FUNCTION PREPARE_TRMTOL_UNPACK SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURRENT,KF_TOTAL) @@ -222,10 +222,9 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN ! ------------------------------------------------------------------ -USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT +USE PARKIND_ECTRANS ,ONLY : JPIM,JPRBT,JPIB USE TPM_DISTR ,ONLY : D,MYSETW,MYPROC, NPROC, D_NSTAGTF, D_NPNTGTB0,D_NPTRLS,D_NDGL_FS USE TPM_GEOMETRY ,ONLY : G_NMEN,G_NLOEN,G_NLOEN_MAX -USE ISO_C_BINDING ,ONLY : C_SIZE_T ! IMPLICIT NONE @@ -236,11 +235,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK -INTEGER(KIND=JPIM) :: JM,JF,IGLG,ISTA,OFFSET_VAR,IOFF_LAT,KGL +INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL +INTEGER(KIND=JPIB) :: IOFF_LAT, ISTA REAL(KIND=JPRBT) :: RET_REAL, RET_COMPLEX CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& - & 1_C_SIZE_T, int(KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1)),kind=c_size_t)) + & 1_JPIB, 1_JPIB*KF_TOTAL*D%NLENGTF*SIZEOF(PREEL_COMPLEX(1))) #ifdef OMPGPU #endif @@ -268,12 +268,12 @@ SUBROUTINE TRMTOL_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,KF_CURREN RET_REAL = 0.0_JPRBT RET_COMPLEX = 0.0_JPRBT IF (JM <= G_NMEN(IGLG)) THEN - ISTA = D_NPNTGTB0(JM,KGL)*KF_CURRENT*2 + ISTA = 2_JPIB*D_NPNTGTB0(JM,KGL)*KF_CURRENT RET_REAL = FOUBUF(ISTA+2*JF-1) RET_COMPLEX = FOUBUF(ISTA+2*JF ) ENDIF - IOFF_LAT = KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) + IOFF_LAT = 1_JPIB*KF_TOTAL*D_NSTAGTF(KGL)+(JF-1)*(D_NSTAGTF(KGL+1)-D_NSTAGTF(KGL)) PREEL_COMPLEX(IOFF_LAT+2*JM+1) = RET_REAL PREEL_COMPLEX(IOFF_LAT+2*JM+2) = RET_COMPLEX ENDIF