diff --git a/CMakeLists.txt b/CMakeLists.txt index dc8f105d6..43c2ca125 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -43,7 +43,6 @@ ecbuild_find_package( NAME fiat REQUIRED ) # Inherit MPI feature from FIAT (if you don't want MPI, rebuild FIAT with ENABLE_MPI=OFF) set( HAVE_MPI ${fiat_HAVE_MPI} ) -set( ectrans_HAVE_MPI ${fiat_HAVE_MPI} ) ecbuild_add_option( FEATURE OMP DEFAULT ON diff --git a/src/programs/ectrans.in b/src/programs/ectrans.in index a80893d26..42e7f8660 100755 --- a/src/programs/ectrans.in +++ b/src/programs/ectrans.in @@ -50,7 +50,7 @@ info() echo " flags : @EC_Fortran_FLAGS@" echo "" echo "Features:" - echo " MPI : @ectrans_HAVE_MPI@" + echo " MPI : @HAVE_MPI@" echo " OMP : @ectrans_HAVE_OMP@" echo " MKL : @ectrans_HAVE_MKL@" echo " FFTW : @ectrans_HAVE_FFTW@" diff --git a/src/trans/gpu/CMakeLists.txt b/src/trans/gpu/CMakeLists.txt index 0a19b4b3b..bdba08570 100644 --- a/src/trans/gpu/CMakeLists.txt +++ b/src/trans/gpu/CMakeLists.txt @@ -40,6 +40,11 @@ else() ecbuild_info("warn: HIP and CUDA not found") endif() +if( HAVE_MPI AND HAVE_GPU_AWARE_MPI AND NOT fiat_HAVE_MPL_F08 ) + set( USE_RAW_MPI 1 ) +else() + set( USE_RAW_MPI 0 ) +endif() set( GPU_LIBRARY_TYPE SHARED ) if( HAVE_GPU_STATIC ) @@ -148,14 +153,14 @@ foreach( prec dp sp ) PRIVATE_LIBS ${ECTRANS_GPU_HIP_LIBRARIES} $<${HAVE_ACC}:OpenACC::OpenACC_Fortran> $<${HAVE_OMP}:OpenMP::OpenMP_Fortran> - $<${HAVE_MPI}:MPI::MPI_Fortran> + $<${USE_RAW_MPI}:MPI::MPI_Fortran> PRIVATE_DEFINITIONS ${GPU_RUNTIME}GPU ${GPU_OFFLOAD}GPU $<${HAVE_CUTLASS}:USE_CUTLASS> $<${HAVE_CUTLASS_3XTF32}:USE_CUTLASS_3XTF32> $<${HAVE_GPU_GRAPHS_GEMM}:USE_GRAPHS_GEMM> $<${HAVE_GPU_GRAPHS_FFT}:USE_GRAPHS_FFT> $<${HAVE_GPU_AWARE_MPI}:USE_GPU_AWARE_MPI> - ECTRANS_HAVE_MPI=${ectrans_HAVE_MPI} + $<${USE_RAW_MPI}:USE_RAW_MPI> ) ectrans_target_fortran_module_directory( diff --git a/src/trans/gpu/internal/trgtol_mod.F90 b/src/trans/gpu/internal/trgtol_mod.F90 index 88a16b56f..f6fbb35fc 100755 --- a/src/trans/gpu/internal/trgtol_mod.F90 +++ 
b/src/trans/gpu/internal/trgtol_mod.F90 @@ -106,15 +106,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT + USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT, MPL_RECV, MPL_SEND USE TPM_GEN, ONLY: LSYNC_TRANS, LMPOFF USE EQ_REGIONS_MOD, ONLY: MY_REGION_EW, MY_REGION_NS USE TPM_DISTR, ONLY: D, MYSETV, MYSETW, MTAGLG, NPRCIDS, MYPROC, NPROC, NPRTRW, & & NPRTRV USE PE2SET_MOD, ONLY: PE2SET - USE MPL_DATA_MODULE, ONLY: MPL_COMM_OML + USE MPL_DATA_MODULE, ONLY: MPL_COMM_OML, JP_NON_BLOCKING_STANDARD USE OML_MOD, ONLY: OML_MY_THREAD -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REQUEST, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif @@ -186,20 +186,20 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM TYPE(MPI_REQUEST) :: IREQUEST(2*NPROC) +#else + INTEGER(JPIM) :: IREQUEST(2*NPROC) #endif - - #ifdef PARKINDTRANS_SINGLE #define TRGTOL_DTYPE MPI_REAL4 #else #define TRGTOL_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. 
LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() ) ENDIF @@ -673,12 +673,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO INR=1,IRECV_COUNTS IR=IR+1 IPROC=IRECV_TO_PROC(INR) -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)),IRECVTOT_MPI(IPROC), & & TRGTOL_DTYPE,NPRCIDS(IPROC)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_RECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & + & KSOURCE=NPRCIDS(IPROC), KTAG=MTAGLG, KMP_TYPE=JP_NON_BLOCKING_STANDARD, & + & KREQUEST=IREQUEST(IR)) + IREQ(IR) = IREQUEST(IR) #endif ENDDO @@ -686,12 +689,15 @@ SUBROUTINE TRGTOL(ALLOCATOR,HTRGTOL,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO INS=1,ISEND_COUNTS IR=IR+1 ISEND=ISEND_TO_PROC(INS) -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRGTOL_DTYPE,NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_SEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)), & + & KDEST=NPRCIDS(ISEND), KTAG=MTAGLG, KMP_TYPE=JP_NON_BLOCKING_STANDARD, & + & KREQUEST=IREQUEST(IR)) + IREQ(IR) = IREQUEST(IR) #endif ENDDO diff --git a/src/trans/gpu/internal/trltog_mod.F90 b/src/trans/gpu/internal/trltog_mod.F90 index 4fcc11496..7e8d0791f 100755 --- a/src/trans/gpu/internal/trltog_mod.F90 +++ b/src/trans/gpu/internal/trltog_mod.F90 @@ -107,15 +107,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, USE PARKIND_ECTRANS, ONLY: JPIM, JPRB, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT + USE MPL_MODULE, ONLY: MPL_WAIT, MPL_BARRIER, MPL_ABORT, MPL_RECV, MPL_SEND USE TPM_GEN, ONLY: 
LSYNC_TRANS, NERR, LMPOFF USE EQ_REGIONS_MOD, ONLY: MY_REGION_EW, MY_REGION_NS USE TPM_DISTR, ONLY: D,MYSETV, MYSETW, MTAGLG,NPRCIDS,MYPROC,NPROC,NPRTRW,NPRTRV USE PE2SET_MOD, ONLY: PE2SET - USE MPL_DATA_MODULE, ONLY: MPL_COMM_OML + USE MPL_DATA_MODULE, ONLY: MPL_COMM_OML, JP_NON_BLOCKING_STANDARD USE OML_MOD, ONLY: OML_MY_THREAD USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REQUEST, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ISEND, MPI_IRECV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif @@ -192,9 +192,11 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, TYPE(EXT_ACC_ARR_DESC) :: ACC_POINTERS(5) ! at most 5 copyins... INTEGER(KIND=JPIM) :: ACC_POINTERS_CNT -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM TYPE(MPI_REQUEST) :: IREQUEST(NPROC*2) +#else + INTEGER(KIND=JPIM) :: IREQUEST(NPROC*2) #endif #ifdef PARKINDTRANS_SINGLE @@ -202,7 +204,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, #else #define TRLTOG_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. 
LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_COMM_OML( OML_MY_THREAD() ) ENDIF @@ -782,7 +784,7 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO INR=1,IRECV_COUNTS IR=IR+1 IRECV=IRECV_TO_PROC(INR) -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_IRECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & & IRECVTOT_MPI(IRECV), & & TRLTOG_DTYPE,NPRCIDS(IRECV)-1, & @@ -790,7 +792,10 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, & IERROR ) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_RECV(ZCOMBUFR(ICOMBUFR_OFFSET(INR)+1:ICOMBUFR_OFFSET(INR+1)), & + & KSOURCE=NPRCIDS(IRECV), KTAG=MTAGLG, KMP_TYPE=JP_NON_BLOCKING_STANDARD, & + & KREQUEST=IREQUEST(IR)) + IREQ(IR) = IREQUEST(IR) #endif ENDDO @@ -798,12 +803,15 @@ SUBROUTINE TRLTOG(ALLOCATOR,HTRLTOG,PREEL_REAL,KF_FS,KF_GP,KF_UV_G,KF_SCALARS_G, DO INS=1,ISEND_COUNTS IR=IR+1 ISEND=ISEND_TO_PROC(INS) -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_ISEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)),ISENDTOT_MPI(ISEND), & & TRLTOG_DTYPE, NPRCIDS(ISEND)-1,MTAGLG,LOCAL_COMM,IREQUEST(IR),IERROR) IREQ(IR) = IREQUEST(IR)%MPI_VAL #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_SEND(ZCOMBUFS(ICOMBUFS_OFFSET(INS)+1:ICOMBUFS_OFFSET(INS+1)), & + & KDEST=NPRCIDS(ISEND), KTAG=MTAGLG, KMP_TYPE=JP_NON_BLOCKING_STANDARD, & + & KREQUEST=IREQUEST(IR)) + IREQ(IR) = IREQUEST(IR) #endif ENDDO diff --git a/src/trans/gpu/internal/trltom_mod.F90 b/src/trans/gpu/internal/trltom_mod.F90 index 03c62dbd6..f8b63add1 100755 --- a/src/trans/gpu/internal/trltom_mod.F90 +++ b/src/trans/gpu/internal/trltom_mod.F90 @@ -90,10 +90,10 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK + USE MPL_MODULE, ONLY: MPL_BARRIER, 
MPL_ALL_MS_COMM, MPL_MYRANK, MPL_ALLTOALLV USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif @@ -116,7 +116,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRLTOM_HANDLE), INTENT(IN) :: HTRLTOM -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM #endif @@ -126,7 +126,7 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #define TRLTOM_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM ENDIF @@ -203,12 +203,14 @@ SUBROUTINE TRLTOM(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !$ACC UPDATE HOST(PFBUF_IN,PFBUF) #endif #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& & PFBUF,ILENR,IOFFR, TRLTOM_DTYPE, & & LOCAL_COMM,IERROR) #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN, KSENDCOUNTS=ILENS, PRECVBUF=PFBUF, KRECVCOUNTS=ILENR, & + & KSENDDISPL=IOFFS, KRECVDISPL=IOFFR, KCOMM=MPL_ALL_MS_COMM, & + & CDSTRING='TRLTOM:') #endif #ifdef USE_GPU_AWARE_MPI #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trltomad_mod.F90 b/src/trans/gpu/internal/trltomad_mod.F90 index c2f48014e..32d468eff 100755 --- a/src/trans/gpu/internal/trltomad_mod.F90 +++ b/src/trans/gpu/internal/trltomad_mod.F90 @@ -20,7 +20,7 @@ MODULE TRLTOMAD_MOD TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN END TYPE CONTAINS - FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOM) + FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOMAD) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB USE TPM_DISTR, ONLY: D USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE @@ -30,15 +30,16 @@ 
FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOM) TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(TRLTOMAD_HANDLE) :: HTRLTOM - + TYPE(TRLTOMAD_HANDLE) :: HTRLTOMAD + INTEGER(KIND=JPIB) :: IALLOC_SZ REAL(KIND=JPRBT) :: DUMMY - HTRLTOM%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HFOUBUF_IN") + IALLOC_SZ = 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY) + HTRLTOMAD%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRLTOMAD%HFOUBUF_IN") END FUNCTION - SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) - !**** *TRLTOM * - transposition in Fourierspace + SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOMAD,PFBUF_IN,PFBUF,KF_FS) + !**** *TRLTOMAD * - transposition in Fourierspace ! Purpose. ! -------- @@ -46,11 +47,11 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! over latitudes to partitioning over wave numbers ! This is done between inverse Legendre Transform ! and inverse FFT. - ! This is the inverse routine of TRMTOL. + ! This is the inverse routine of TRMTOLAD. !** Interface. ! ---------- - ! *CALL* *TRLTOM(...)* + ! *CALL* *TRLTOMAD(...)* ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. @@ -78,14 +79,14 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) ! Modifications. ! -------------- ! Original : 95-10-01 - ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use + ! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use ! (NCOMBFLEN) for nphase.eq.1 ! Modified : 99-05-28 D.Salmond - Optimise copies. ! Modified : 00-02-02 M.Hamrud - Remove NPHASE ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message ! passing and buffer packing ! G.Mozdzynski: 08-01-01 Cleanup - ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS + ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS ! 
------------------------------------------------------------------ USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB @@ -93,12 +94,12 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif - USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION + USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX USE ISO_C_BINDING, ONLY: C_SIZEOF USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS @@ -115,27 +116,27 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) INTEGER(KIND=JPIM) :: IERROR TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOMAD_HANDLE), INTENT(IN) :: HTRLTOM -#if ECTRANS_HAVE_MPI + TYPE(TRLTOMAD_HANDLE), INTENT(IN) :: HTRLTOMAD +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM #endif #ifdef PARKINDTRANS_SINGLE -#define TRLTOM_DTYPE MPI_REAL4 +#define TRLTOMAD_DTYPE MPI_REAL4 #else -#define TRLTOM_DTYPE MPI_REAL8 +#define TRLTOMAD_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. 
LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM ENDIF #endif - IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) + IF (LHOOK) CALL DR_HOOK('TRLTOMAD',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HFOUBUF_IN),& - & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(PFBUF_IN(1))) + CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOMAD%HFOUBUF_IN),& + & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(PFBUF_IN(1))) #ifdef OMPGPU @@ -159,7 +160,7 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) IF (ILENS(IRANK) /= ILENR(IRANK)) THEN WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK) - CALL ABORT_TRANS("TRLTOM: Error - ILENS(IRANK) /= ILENR(IRANK)") + CALL ABORT_TRANS("TRLTOMAD: Error - ILENS(IRANK) /= ILENR(IRANK)") ENDIF IF (ILENS(IRANK) > 0) THEN FROM_SEND = IOFFS(IRANK) + 1 @@ -204,20 +205,21 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) !$ACC UPDATE HOST(PFBUF_IN,PFBUF) #endif #endif -#if ECTRANS_HAVE_MPI - CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRLTOM_DTYPE,& - & PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& - & LOCAL_COMM,IERROR) +#ifdef USE_RAW_MPI + CALL MPI_ALLTOALLV(PFBUF, ILENR, IOFFR, TRLTOMAD_DTYPE, PFBUF_IN, ILENS, IOFFS, & + & TRLTOMAD_DTYPE, LOCAL_COMM,IERROR) #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF, KSENDCOUNTS=ILENR, PRECVBUF=PFBUF_IN, KRECVCOUNTS=ILENS, & + & KSENDDISPL=IOFFR, KRECVDISPL=IOFFS, KCOMM=MPL_ALL_MS_COMM, & + & CDSTRING='TRLTOMAD:') #endif #ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif #ifdef ACCGPU !$ACC END HOST_DATA #endif +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif #else !! 
this is safe-but-slow fallback for running without GPU-aware MPI #ifdef OMPGPU @@ -244,7 +246,8 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) IEND = ISTA+ILEN-1 CALL GSTATS(1607,0) #ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(IEND,ISTA,PFBUF_IN,PFBUF) + !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) & + !$OMP SHARED(IEND,ISTA,PFBUF_IN,PFBUF) #endif #ifdef ACCGPU !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,IEND) @@ -263,6 +266,7 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) #endif IF (LHOOK) CALL DR_HOOK('TRLTOMAD',1,ZHOOK_HANDLE) + ! ------------------------------------------------------------------ - END SUBROUTINE TRLTOMAD +END SUBROUTINE TRLTOMAD END MODULE TRLTOMAD_MOD diff --git a/src/trans/gpu/internal/trmtol_mod.F90 b/src/trans/gpu/internal/trmtol_mod.F90 index 6775f9c2c..9622e7307 100755 --- a/src/trans/gpu/internal/trmtol_mod.F90 +++ b/src/trans/gpu/internal/trmtol_mod.F90 @@ -93,7 +93,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif @@ -117,7 +117,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR TYPE(TRMTOL_HANDLE), INTENT(IN) :: HTRMTOL -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM #endif @@ -127,7 +127,7 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #define TRMTOL_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. 
LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM ENDIF @@ -203,12 +203,14 @@ SUBROUTINE TRMTOL(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI CALL MPI_ALLTOALLV(PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& & PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& & LOCAL_COMM,IERROR) #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF_IN, KSENDCOUNTS=ILENS, PRECVBUF=PFBUF, KRECVCOUNTS=ILENR, & + & KSENDDISPL=IOFFS, KRECVDISPL=IOFFR, KCOMM=MPL_ALL_MS_COMM, & + & CDSTRING='TRMTOL:') #endif #ifdef USE_GPU_AWARE_MPI diff --git a/src/trans/gpu/internal/trmtolad_mod.F90 b/src/trans/gpu/internal/trmtolad_mod.F90 index c94dd6916..e3710a1fa 100755 --- a/src/trans/gpu/internal/trmtolad_mod.F90 +++ b/src/trans/gpu/internal/trmtolad_mod.F90 @@ -38,8 +38,8 @@ FUNCTION PREPARE_TRMTOLAD(ALLOCATOR, KF_LEG) RESULT(HTRMTOLAD) HTRMTOLAD%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOLAD%HFOUBUF_IN") END FUNCTION -SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) - !**** *trmtol * - transposition in Fourier space +SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOLAD,PFBUF_IN,PFBUF,KF_LEG) + !**** *TRMTOLAD * - transposition in Fourier space ! Purpose. ! -------- @@ -47,12 +47,12 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) ! over wave numbers to partitioning over latitudes. ! It is called between direct FFT and direct Legendre ! transform. - ! This routine is the inverse of TRLTOM. + ! This routine is the inverse of TRLTOMAD. !** Interface. ! ---------- - ! *call* *trmtol(...)* + ! *CALL* *TRMTOLAD(...)* ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is ! -------------------- used for both input and output. 
@@ -94,7 +94,7 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) #endif @@ -116,19 +116,19 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) INTEGER(KIND=JPIM) :: IERROR TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRMTOLAD_HANDLE), INTENT(IN) :: HTRMTOL + TYPE(TRMTOLAD_HANDLE), INTENT(IN) :: HTRMTOLAD -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI TYPE(MPI_COMM) :: LOCAL_COMM #endif #ifdef PARKINDTRANS_SINGLE -#define TRMTOL_DTYPE MPI_REAL4 +#define TRMTOLAD_DTYPE MPI_REAL4 #else -#define TRMTOL_DTYPE MPI_REAL8 +#define TRMTOLAD_DTYPE MPI_REAL8 #endif -#if ECTRANS_HAVE_MPI +#ifdef USE_RAW_MPI IF(.NOT. 
LMPOFF) THEN LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM ENDIF @@ -136,9 +136,16 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IF (LHOOK) CALL DR_HOOK('TRMTOLAD',0,ZHOOK_HANDLE) - CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HFOUBUF_IN),& + CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOLAD%HFOUBUF_IN),& & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(PFBUF_IN(1))) +#ifdef OMPGPU + !$OMP TARGET DATA MAP(PRESENT,ALLOC:PFBUF,PFBUF_IN) +#endif +#ifdef ACCGPU + !$ACC DATA PRESENT(PFBUF,PFBUF_IN) +#endif + IF(NPROC > 1) THEN DO J=1,NPRTRW ILENS(J) = D%NLTSFTB(J)*2*KF_LEG @@ -153,7 +160,7 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) IF (ILENS(IRANK) /= ILENR(IRANK)) THEN WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK) - CALL ABORT_TRANS("TRMTOL: ILENS(IRANK) /= ILENR(IRANK)") + CALL ABORT_TRANS("TRMTOLAD: Error - ILENS(IRANK) /= ILENR(IRANK)") ENDIF IF (ILENS(IRANK) > 0) THEN FROM_SEND = IOFFS(IRANK) + 1 @@ -197,17 +204,21 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif #else !! this is safe-but-slow fallback for running without GPU-aware MPI +#ifdef OMPGPU + !$OMP TARGET UPDATE FROM(PFBUF_IN,PFBUF) +#endif +#ifdef ACCGPU !$ACC UPDATE HOST(PFBUF_IN,PFBUF) #endif - -#if ECTRANS_HAVE_MPI - CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& - & PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& - & LOCAL_COMM,IERROR) +#endif +#ifdef USE_RAW_MPI + CALL MPI_ALLTOALLV(PFBUF, ILENR, IOFFR, TRMTOLAD_DTYPE, PFBUF_IN, ILENS, IOFFS, & + & TRMTOLAD_DTYPE, LOCAL_COMM,IERROR) #else - CALL ABORT_TRANS("Should not be here: MPI is disabled") + CALL MPL_ALLTOALLV(PSENDBUF=PFBUF, KSENDCOUNTS=ILENR, PRECVBUF=PFBUF_IN, KRECVCOUNTS=ILENS, & + & KSENDDISPL=IOFFR, KRECVDISPL=IOFFS, KCOMM=MPL_ALL_MS_COMM, & + & CDSTRING='TRMTOLAD:') #endif - #ifdef USE_GPU_AWARE_MPI #ifdef ACCGPU !$ACC END HOST_DATA @@ -217,7 +228,12 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) #endif #else !! 
this is safe-but-slow fallback for running without GPU-aware MPI +#ifdef OMPGPU + !$OMP TARGET UPDATE TO(PFBUF_IN) +#endif +#ifdef ACCGPU !$ACC UPDATE DEVICE(PFBUF_IN) +#endif #endif IF (LSYNC_TRANS) THEN CALL GSTATS(441,0) @@ -251,7 +267,14 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) CALL GSTATS(1608,1) ENDIF - IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) +#ifdef OMPGPU + !$OMP END TARGET DATA +#endif +#ifdef ACCGPU + !$ACC END DATA +#endif + + IF (LHOOK) CALL DR_HOOK('TRMTOLAD',1,ZHOOK_HANDLE) ! ------------------------------------------------------------------ END SUBROUTINE TRMTOLAD diff --git a/src/transi/CMakeLists.txt b/src/transi/CMakeLists.txt index 7d1c75408..85121d74b 100644 --- a/src/transi/CMakeLists.txt +++ b/src/transi/CMakeLists.txt @@ -23,7 +23,7 @@ ecbuild_add_library( TARGET transi_dp $ PRIVATE_LIBS trans_dp $<${ectrans_HAVE_ETRANS}:etrans_dp> - PRIVATE_DEFINITIONS ECTRANS_HAVE_MPI=${ectrans_HAVE_MPI} + PRIVATE_DEFINITIONS ECTRANS_HAVE_MPI=${HAVE_MPI} ECTRANS_HAVE_ETRANS=${ectrans_HAVE_ETRANS} ) ectrans_target_fortran_module_directory( TARGET transi_dp @@ -41,7 +41,7 @@ if( HAVE_GPU ) PUBLIC_INCLUDES $ $ PRIVATE_LIBS trans_gpu_dp - PRIVATE_DEFINITIONS ECTRANS_HAVE_MPI=${ectrans_HAVE_MPI} + PRIVATE_DEFINITIONS ECTRANS_HAVE_MPI=${HAVE_MPI} ECTRANS_GPU_VERSION ECTRANS_HAVE_ETRANS=0 )