diff --git a/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 b/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 index 695fd4da0..53436f5f6 100755 --- a/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 +++ b/src/trans/gpu/internal/dir_trans_ctlad_mod.F90 @@ -81,10 +81,10 @@ SUBROUTINE DIR_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& USE FTINV_MOD, ONLY: FTINV_HANDLE, PREPARE_FTINV, FTINV USE LTDIRAD_MOD, ONLY: LTDIRAD_HANDLE, PREPARE_LTDIRAD, LTDIRAD USE TRLTOG_MOD, ONLY: TRLTOG_HANDLE, PREPARE_TRLTOG, TRLTOG - USE TRLTOMAD_MOD, ONLY: TRLTOMAD_HANDLE, PREPARE_TRLTOMAD, TRLTOMAD - USE TRLTOMAD_PACK_UNPACK, ONLY: TRLTOMAD_PACK_HANDLE, TRLTOMAD_UNPACK_HANDLE, & - & PREPARE_TRLTOMAD_PACK, PREPARE_TRLTOMAD_UNPACK, TRLTOMAD_PACK, & - & TRLTOMAD_UNPACK + USE TRMTOL_MOD, ONLY: TRMTOL_HANDLE, PREPARE_TRMTOL, TRMTOL + USE TRMTOL_PACK_UNPACK, ONLY: TRMTOL_PACK_HANDLE, TRMTOL_UNPACK_HANDLE, & + & PREPARE_TRMTOL_PACK, PREPARE_TRMTOL_UNPACK + USE TRLTOMAD_PACK_UNPACK, ONLY: TRLTOMAD_PACK, TRLTOMAD_UNPACK USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS IMPLICIT NONE @@ -124,9 +124,9 @@ SUBROUTINE DIR_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR TYPE(TRLTOG_HANDLE) :: HTRLTOG TYPE(FTINV_HANDLE) :: HFTINV - TYPE(TRLTOMAD_PACK_HANDLE) :: HTRLTOM_PACK - TYPE(TRLTOMAD_HANDLE) :: HTRLTOM - TYPE(TRLTOMAD_UNPACK_HANDLE) :: HTRLTOM_UNPACK + TYPE(TRMTOL_UNPACK_HANDLE) :: HTRMTOL_UNPACK + TYPE(TRMTOL_HANDLE) :: HTRMTOL + TYPE(TRMTOL_PACK_HANDLE) :: HTRMTOL_PACK TYPE(LTDIRAD_HANDLE) :: HLTDIR IF (NPROMATR > 0) THEN @@ -137,9 +137,9 @@ SUBROUTINE DIR_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& ALLOCATOR = MAKE_BUFFERED_ALLOCATOR() IF (KF_FS > 0) THEN HLTDIR = PREPARE_LTDIRAD(ALLOCATOR, KF_FS, KF_UV) - HTRLTOM_UNPACK = PREPARE_TRLTOMAD_UNPACK(ALLOCATOR, KF_FS) - HTRLTOM = PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) - HTRLTOM_PACK = PREPARE_TRLTOMAD_PACK(ALLOCATOR, KF_FS) + HTRMTOL_PACK = PREPARE_TRMTOL_PACK(ALLOCATOR, KF_FS) + HTRMTOL = PREPARE_TRMTOL(ALLOCATOR, KF_FS) + HTRMTOL_UNPACK = PREPARE_TRMTOL_UNPACK(ALLOCATOR, KF_FS) HFTINV = PREPARE_FTINV(ALLOCATOR,KF_FS) ENDIF HTRLTOG = PREPARE_TRLTOG(ALLOCATOR,KF_GP,KF_FS) @@ -152,9 +152,9 @@ SUBROUTINE DIR_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_UV,KF_SCALARS,& & PSPSC3A,PSPSC3B,PSPSC2) CALL GSTATS(153,0) - CALL TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) - CALL TRLTOMAD(ALLOCATOR,HTRLTOM,FOUBUF_IN,FOUBUF,KF_FS) - CALL TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + CALL TRLTOMAD_UNPACK(ALLOCATOR,HTRMTOL_PACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + CALL TRMTOL(ALLOCATOR,HTRMTOL,FOUBUF,FOUBUF_IN,KF_FS) + CALL TRLTOMAD_PACK(ALLOCATOR,HTRMTOL_UNPACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) CALL GSTATS(153,1) ! fourier transform from PREEL_REAL to PREEL_COMPLEX (in-place!) diff --git a/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 b/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 index 0620c4d69..786ab4fe6 100644 --- a/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 +++ b/src/trans/gpu/internal/inv_trans_ctlad_mod.F90 @@ -88,7 +88,7 @@ SUBROUTINE INV_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& USE TPM_TRANS, ONLY: LDIVGP, LSCDERS, LUVDER, LVORGP, GROWING_ALLOCATION USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, MAKE_BUFFERED_ALLOCATOR, INSTANTIATE_ALLOCATOR - USE TRMTOLAD_MOD, ONLY: PREPARE_TRMTOLAD, TRMTOLAD_HANDLE, TRMTOLAD + USE TRLTOM_MOD, ONLY: PREPARE_TRLTOM, TRLTOM_HANDLE, TRLTOM USE LTINVAD_MOD, ONLY: PREPARE_LTINVAD, LTINVAD_HANDLE, LTINVAD USE TRMTOLAD_PACK_UNPACK, ONLY: TRMTOLAD_PACK_HANDLE, TRMTOLAD_UNPACK_HANDLE, & & PREPARE_TRMTOLAD_PACK, PREPARE_TRMTOLAD_UNPACK, TRMTOLAD_PACK, & @@ -141,7 +141,7 @@ SUBROUTINE INV_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& TYPE(BUFFERED_ALLOCATOR) :: ALLOCATOR TYPE(LTINVAD_HANDLE) :: HLTINVAD TYPE(TRMTOLAD_PACK_HANDLE) :: HTRMTOL_PACK - TYPE(TRMTOLAD_HANDLE) :: HTRMTOL + TYPE(TRLTOM_HANDLE) :: HTRLTOM TYPE(TRMTOLAD_UNPACK_HANDLE) :: HTRMTOL_UNPACK TYPE(FTDIR_HANDLE) :: HFTDIR TYPE(TRGTOL_HANDLE) :: HTRGTOL @@ -191,7 +191,7 @@ SUBROUTINE INV_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& IF (KF_FS > 0) THEN HFTDIR = PREPARE_FTDIR(ALLOCATOR,IF_FOURIER) HTRMTOL_UNPACK = PREPARE_TRMTOLAD_UNPACK(ALLOCATOR,IF_LEG) - HTRMTOL = PREPARE_TRMTOLAD(ALLOCATOR,IF_LEG) + HTRLTOM = PREPARE_TRLTOM(ALLOCATOR,IF_LEG) HTRMTOL_PACK = PREPARE_TRMTOLAD_PACK(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) HLTINVAD = PREPARE_LTINVAD(ALLOCATOR,KF_UV,KF_SCALARS,LVORGP,LDIVGP,LSCDERS) ENDIF @@ -218,7 +218,7 @@ SUBROUTINE INV_TRANS_CTLAD(KF_UV_G,KF_SCALARS_G,KF_GP,KF_FS,KF_OUT_LT,& ! Packing into send buffer, to fourier space and unpack CALL GSTATS(152,0) CALL TRMTOLAD_UNPACK(ALLOCATOR,HTRMTOL_UNPACK,FOUBUF,PREEL_COMPLEX,IF_LEG,IF_FOURIER) - CALL TRMTOLAD(ALLOCATOR,HTRMTOL,FOUBUF_IN,FOUBUF,IF_LEG) + CALL TRLTOM(ALLOCATOR,HTRLTOM,FOUBUF,FOUBUF_IN,IF_LEG) CALL TRMTOLAD_PACK(ALLOCATOR,HTRMTOL_PACK,ZOUTS,ZOUTA,ZOUTS0,ZOUTA0,FOUBUF_IN,IF_LEG) CALL GSTATS(152,1) diff --git a/src/trans/gpu/internal/trltomad_mod.F90 b/src/trans/gpu/internal/trltomad_mod.F90 deleted file mode 100755 index c2f48014e..000000000 --- a/src/trans/gpu/internal/trltomad_mod.F90 +++ /dev/null @@ -1,268 +0,0 @@ -! (C) Copyright 1995- ECMWF. -! (C) Copyright 1995- Meteo-France. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE TRLTOMAD_MOD - USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE - IMPLICIT NONE - - PRIVATE - PUBLIC :: TRLTOMAD, PREPARE_TRLTOMAD, TRLTOMAD_HANDLE - - TYPE TRLTOMAD_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN - END TYPE -CONTAINS - FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOM) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB - USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE - USE ISO_C_BINDING, ONLY: C_SIZEOF - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(TRLTOMAD_HANDLE) :: HTRLTOM - - REAL(KIND=JPRBT) :: DUMMY - - HTRLTOM%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HFOUBUF_IN") - END FUNCTION - - SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS) - !**** *TRLTOM * - transposition in Fourierspace - - ! Purpose. - ! -------- - ! Transpose Fourier coefficients from partitioning - ! over latitudes to partitioning over wave numbers - ! This is done between inverse Legendre Transform - ! and inverse FFT. - ! This is the inverse routine of TRMTOL. - - !** Interface. - ! ---------- - ! *CALL* *TRLTOM(...)* - - ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is - ! -------------------- used for both input and output. - - ! KF_FS - Number of fields communicated - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use - ! (NCOMBFLEN) for nphase.eq.1 - ! Modified : 99-05-28 D.Salmond - Optimise copies. - ! Modified : 00-02-02 M.Hamrud - Remove NPHASE - ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message - ! passing and buffer packing - ! G.Mozdzynski: 08-01-01 Cleanup - ! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB - USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK - USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW - USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI - USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 - ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) -#endif - USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION - USE ISO_C_BINDING, ONLY: C_SIZEOF - USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS - - IMPLICIT NONE - - INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_FS - REAL(KIND=JPRBT) ,INTENT(INOUT), POINTER :: PFBUF(:) - REAL(KIND=JPRBT) ,INTENT(OUT) , POINTER :: PFBUF_IN(:) - - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK - INTEGER(KIND=JPIB) :: JPOS, ISTA, IEND, ILEN - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - INTEGER(KIND=JPIM) :: IERROR - - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOMAD_HANDLE), INTENT(IN) :: HTRLTOM -#if ECTRANS_HAVE_MPI - TYPE(MPI_COMM) :: LOCAL_COMM -#endif - -#ifdef PARKINDTRANS_SINGLE -#define TRLTOM_DTYPE MPI_REAL4 -#else -#define TRLTOM_DTYPE MPI_REAL8 -#endif - -#if ECTRANS_HAVE_MPI - IF(.NOT. LMPOFF) THEN - LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM - ENDIF -#endif - - IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE) - - CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HFOUBUF_IN),& - & 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(PFBUF_IN(1))) - - -#ifdef OMPGPU - !$OMP TARGET DATA MAP(PRESENT,ALLOC:PFBUF,PFBUF_IN) -#endif -#ifdef ACCGPU - !$ACC DATA PRESENT(PFBUF,PFBUF_IN) -#endif - - IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSGTB(J)*2*KF_FS - IOFFS(J) = D%NSTAGT0B(J)*2*KF_FS - ILENR(J) = D%NLTSFTB(J)*2*KF_FS - IOFFR(J) = D%NSTAGT1B(J)*2*KF_FS - ENDDO - - CALL GSTATS(806,0) - - ! copy to self workaround - IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) - IF (ILENS(IRANK) /= ILENR(IRANK)) THEN - WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK) - CALL ABORT_TRANS("TRLTOM: Error - ILENS(IRANK) /= ILENR(IRANK)") - ENDIF - IF (ILENS(IRANK) > 0) THEN - FROM_SEND = IOFFS(IRANK) + 1 - TO_SEND = FROM_SEND + ILENS(IRANK) - 1 - FROM_RECV = IOFFR(IRANK) + 1 - TO_RECV = FROM_RECV + ILENR(IRANK) - 1 -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) FIRSTPRIVATE(PFBUF,PFBUF_IN,FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) - DO JPOS=FROM_RECV,TO_RECV - PFBUF_IN(JPOS-FROM_RECV+FROM_SEND) = PFBUF(JPOS) - ENDDO - !$OMP END TARGET TEAMS DISTRIBUTE PARALLEL DO -#endif -#ifdef ACCGPU - !$ACC KERNELS ASYNC(1) - PFBUF_IN(FROM_SEND:TO_SEND) = PFBUF(FROM_RECV:TO_RECV) - !$ACC END KERNELS -#endif - ILENS(IRANK) = 0 - ILENR(IRANK) = 0 - ENDIF - - IF (LSYNC_TRANS) THEN - CALL GSTATS(430,0) - CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') - CALL GSTATS(430,1) - ENDIF - CALL GSTATS(411,0) -#ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU - !$OMP TARGET DATA USE_DEVICE_PTR(PFBUF_IN,PFBUF) -#endif -#ifdef ACCGPU - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) -#endif -#else - !! this is safe-but-slow fallback for running without GPU-aware MPI -#ifdef OMPGPU - !$OMP TARGET UPDATE FROM(PFBUF_IN,PFBUF) -#endif -#ifdef ACCGPU - !$ACC UPDATE HOST(PFBUF_IN,PFBUF) -#endif -#endif -#if ECTRANS_HAVE_MPI - CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRLTOM_DTYPE,& - & PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,& - & LOCAL_COMM,IERROR) -#else - CALL ABORT_TRANS("Should not be here: MPI is disabled") -#endif -#ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif -#ifdef ACCGPU - !$ACC END HOST_DATA -#endif -#else - !! this is safe-but-slow fallback for running without GPU-aware MPI -#ifdef OMPGPU - !$OMP TARGET UPDATE TO(PFBUF_IN) -#endif -#ifdef ACCGPU - !$ACC UPDATE DEVICE(PFBUF_IN) -#endif -#endif - IF (LSYNC_TRANS) THEN - CALL GSTATS(431,0) - CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') - CALL GSTATS(431,1) - ENDIF - CALL GSTATS(411,1) - -#ifdef ACCGPU - !$ACC WAIT(1) -#endif - CALL GSTATS(806,1) - ELSE - ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_FS - ISTA = 2_JPIB*D%NSTAGT1B(MYSETW)*KF_FS+1 - IEND = ISTA+ILEN-1 - CALL GSTATS(1607,0) -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(IEND,ISTA,PFBUF_IN,PFBUF) -#endif -#ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,IEND) -#endif - DO JPOS=ISTA,IEND - PFBUF_IN(JPOS) = PFBUF(JPOS) - ENDDO - CALL GSTATS(1607,1) - ENDIF - -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif -#ifdef ACCGPU - !$ACC END DATA -#endif - - IF (LHOOK) CALL DR_HOOK('TRLTOMAD',1,ZHOOK_HANDLE) - ! ------------------------------------------------------------------ - END SUBROUTINE TRLTOMAD -END MODULE TRLTOMAD_MOD diff --git a/src/trans/gpu/internal/trltomad_pack_unpack.F90 b/src/trans/gpu/internal/trltomad_pack_unpack.F90 index b5e868202..16fb0b71e 100755 --- a/src/trans/gpu/internal/trltomad_pack_unpack.F90 +++ b/src/trans/gpu/internal/trltomad_pack_unpack.F90 @@ -1,4 +1,3 @@ -#define ALIGN(I, A) (((I)+(A)-1)/(A)*(A)) ! (C) Copyright 2000- ECMWF. ! (C) Copyright 2000- Meteo-France. ! (C) Copyright 2022- NVIDIA. @@ -11,43 +10,14 @@ ! MODULE TRLTOMAD_PACK_UNPACK - USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE - USE PARKIND_ECTRANS, ONLY: JPIM IMPLICIT NONE PRIVATE - PUBLIC :: TRLTOMAD_PACK_HANDLE, PREPARE_TRLTOMAD_PACK, TRLTOMAD_PACK - PUBLIC :: TRLTOMAD_UNPACK_HANDLE, PREPARE_TRLTOMAD_UNPACK, TRLTOMAD_UNPACK - - TYPE TRLTOMAD_PACK_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HREEL_COMPLEX - END TYPE - TYPE TRLTOMAD_UNPACK_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HPFBUF - END TYPE - - INTEGER(KIND=JPIM) :: A = 8 !Alignment + PUBLIC :: TRLTOMAD_PACK + PUBLIC :: TRLTOMAD_UNPACK CONTAINS - FUNCTION PREPARE_TRLTOMAD_PACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_PACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB - USE TPM_DISTR, ONLY: D - USE ISO_C_BINDING, ONLY: C_SIZEOF - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(TRLTOMAD_PACK_HANDLE) :: HTRLTOM_PACK - - REAL(KIND=JPRBT) :: DUMMY - - HTRLTOM_PACK%HREEL_COMPLEX = RESERVE(ALLOCATOR, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(DUMMY), & - & "HTRLTOMAD_PACK%HREEL_COMPLEX") - END FUNCTION PREPARE_TRLTOMAD_PACK - - SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) + SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRMTOL_UNPACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) !**** *TRLTOMAD_PACK* - Copy fourier data from local array to buffer ! Purpose. @@ -76,6 +46,7 @@ SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) USE TPM_GEOMETRY, ONLY: G USE TPM_DIM, ONLY: R USE ISO_C_BINDING, ONLY: C_SIZEOF + USE TRMTOL_PACK_UNPACK, ONLY: TRMTOL_UNPACK_HANDLE ! IMPLICIT NONE @@ -84,7 +55,7 @@ SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) REAL(KIND=JPRBT), INTENT(IN) :: FOUBUF_IN(:) INTEGER(KIND=JPIM),INTENT(IN) :: KF_FS TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOMAD_PACK_HANDLE), INTENT(IN) :: HTRLTOM_PACK + TYPE(TRMTOL_UNPACK_HANDLE), INTENT(IN) :: HTRMTOL_UNPACK INTEGER(KIND=JPIM) :: JM,JF,IGLG,OFFSET_VAR,KGL,J,N INTEGER(KIND=JPIB) :: IOFF_LAT,ISTA @@ -94,7 +65,7 @@ SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) ASSOCIATE(D_NSTAGTF=>D%NSTAGTF, D_NPNTGTB0=>D%NPNTGTB0, D_NPTRLS=>D%NPTRLS, & & D_NDGL_FS=>D%NDGL_FS, G_NMEN=>G%NMEN, G_NLOEN=>G%NLOEN, R_NSMAX=>R%NSMAX) - CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRLTOM_PACK%HREEL_COMPLEX),& + CALL ASSIGN_PTR(PREEL_COMPLEX, GET_ALLOCATION(ALLOCATOR, HTRMTOL_UNPACK%HREEL),& & 1_JPIB, 1_JPIB*KF_FS*D%NLENGTF*C_SIZEOF(PREEL_COMPLEX(1))) N = 1_JPIB*KF_FS*D%NLENGTF @@ -170,33 +141,7 @@ SUBROUTINE TRLTOMAD_PACK(ALLOCATOR,HTRLTOM_PACK,PREEL_COMPLEX,FOUBUF_IN,KF_FS) END ASSOCIATE END SUBROUTINE TRLTOMAD_PACK - FUNCTION PREPARE_TRLTOMAD_UNPACK(ALLOCATOR, KF_FS) RESULT(HTRLTOM_UNPACK) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE - USE LEDIR_MOD, ONLY: LEDIR_STRIDES - USE ISO_C_BINDING, ONLY: C_SIZEOF - USE TPM_DISTR, ONLY: D - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS - TYPE(TRLTOMAD_UNPACK_HANDLE) :: HTRLTOM_UNPACK - - INTEGER(KIND=JPIM) :: IIN_STRIDES0 - INTEGER(KIND=JPIB) :: IIN_SIZE - INTEGER(KIND=JPIM) :: IIN0_STRIDES0, IIN0_SIZE - INTEGER(KIND=JPIB) :: ISIZE - - REAL(KIND=JPRBT) :: DUMMY - - CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& - IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) - - HTRLTOM_UNPACK%HPFBUF = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM_UNPACK%HPFBUF") - END FUNCTION PREPARE_TRLTOMAD_UNPACK - - SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) + SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRMTOL_PACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZINPA0,KF_FS,KF_UV) USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPRD, JPIB USE TPM_DIM, ONLY: R USE TPM_GEOMETRY, ONLY: G @@ -205,6 +150,7 @@ SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZI USE TPM_DISTR, ONLY: D USE LEDIR_MOD, ONLY: LEDIR_STRIDES USE, INTRINSIC :: ISO_C_BINDING, ONLY: C_SIZEOF + USE TRMTOL_PACK_UNPACK, ONLY: TRMTOL_PACK_HANDLE IMPLICIT NONE @@ -213,7 +159,7 @@ SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZI REAL(KIND=JPRD), INTENT(IN) :: ZINPS0(:), ZINPA0(:) INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS, KF_UV TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRLTOMAD_UNPACK_HANDLE), INTENT(IN) :: HTRLTOM_UNPACK + TYPE(TRMTOL_PACK_HANDLE), INTENT(IN) :: HTRMTOL_PACK REAL(KIND=JPRBT), POINTER :: PREEL_COMPLEX(:) @@ -232,7 +178,7 @@ SUBROUTINE TRLTOMAD_UNPACK(ALLOCATOR,HTRLTOM_UNPACK,FOUBUF,ZINPS,ZINPA,ZINPS0,ZI CALL LEDIR_STRIDES(KF_FS,IIN_STRIDES0=IIN_STRIDES0,IIN_SIZE=IIN_SIZE,& IIN0_STRIDES0=IIN0_STRIDES0,IIN0_SIZE=IIN0_SIZE) - CALL ASSIGN_PTR(FOUBUF, GET_ALLOCATION(ALLOCATOR, HTRLTOM_UNPACK%HPFBUF),& + CALL ASSIGN_PTR(FOUBUF, GET_ALLOCATION(ALLOCATOR, HTRMTOL_PACK%HFOUBUF_IN),& & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_FS*C_SIZEOF(FOUBUF(1))) #ifdef OMPGPU diff --git a/src/trans/gpu/internal/trmtolad_mod.F90 b/src/trans/gpu/internal/trmtolad_mod.F90 deleted file mode 100755 index c94dd6916..000000000 --- a/src/trans/gpu/internal/trmtolad_mod.F90 +++ /dev/null @@ -1,258 +0,0 @@ -! (C) Copyright 1995- ECMWF. -! (C) Copyright 1995- Meteo-France. -! (C) Copyright 2022- NVIDIA. -! -! This software is licensed under the terms of the Apache Licence Version 2.0 -! which can be obtained at http://www.apache.org/licenses/LICENSE-2.0. -! In applying this licence, ECMWF does not waive the privileges and immunities -! granted to it by virtue of its status as an intergovernmental organisation -! nor does it submit to any jurisdiction. -! - -MODULE TRMTOLAD_MOD -USE BUFFERED_ALLOCATOR_MOD, ONLY: ALLOCATION_RESERVATION_HANDLE -IMPLICIT NONE - -PRIVATE -PUBLIC :: TRMTOLAD, PREPARE_TRMTOLAD, TRMTOLAD_HANDLE - -TYPE TRMTOLAD_HANDLE - TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN -END TYPE -CONTAINS -FUNCTION PREPARE_TRMTOLAD(ALLOCATOR, KF_LEG) RESULT(HTRMTOLAD) - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB - USE TPM_DISTR, ONLY: D - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE - USE ISO_C_BINDING, ONLY: C_SIZEOF - - IMPLICIT NONE - - TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR - INTEGER(KIND=JPIM), INTENT(IN) :: KF_LEG - TYPE(TRMTOLAD_HANDLE) :: HTRMTOLAD - INTEGER(KIND=JPIB) :: IALLOC_SZ - REAL(KIND=JPRBT) :: DUMMY - - IALLOC_SZ = 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(DUMMY) - HTRMTOLAD%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOLAD%HFOUBUF_IN") -END FUNCTION - -SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG) - !**** *trmtol * - transposition in Fourier space - - ! Purpose. - ! -------- - ! Transpose Fourier buffer data from partitioning - ! over wave numbers to partitioning over latitudes. - ! It is called between direct FFT and direct Legendre - ! transform. - ! This routine is the inverse of TRLTOM. - - - !** Interface. - ! ---------- - ! *call* *trmtol(...)* - - ! Explicit arguments : PFBUF - Fourier coefficient buffer. It is - ! -------------------- used for both input and output. - ! KF_LEG - Number of fields communicated - - ! Implicit arguments : - ! -------------------- - - ! Method. - ! ------- - ! See documentation - - ! Externals. - ! ---------- - - ! Reference. - ! ---------- - ! ECMWF Research Department documentation of the IFS - - ! Author. - ! ------- - ! MPP Group *ECMWF* - - ! Modifications. - ! -------------- - ! Original : 95-10-01 - ! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use - ! (NCOMBFLEN) for nphase.eq.1 - ! Modified : 99-05-28 D.Salmond - Optimise copies. - ! Modified : 00-02-02 M.Hamrud - Remove NPHASE - ! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message - ! passing and buffer packing - ! G.Mozdzynski: 08-01-01 Cleanup - ! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS - ! ------------------------------------------------------------------ - - USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB - USE YOMHOOK, ONLY: LHOOK, DR_HOOK, JPHOOK - USE MPL_MODULE, ONLY: MPL_ALLTOALLV, MPL_BARRIER, MPL_ALL_MS_COMM, MPL_MYRANK - USE TPM_DISTR, ONLY: D, NPRTRW, NPROC, MYSETW - USE TPM_GEN, ONLY: LSYNC_TRANS, NERR, LMPOFF -#if ECTRANS_HAVE_MPI - USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8 - ! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157) -#endif - USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION - USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX - USE ISO_C_BINDING, ONLY: C_SIZEOF - USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS - - IMPLICIT NONE - - INTEGER(KIND=JPIM) ,INTENT(IN) :: KF_LEG - REAL(KIND=JPRBT), INTENT(IN) :: PFBUF(:) - REAL(KIND=JPRBT), INTENT(OUT), POINTER :: PFBUF_IN(:) - - INTEGER(KIND=JPIM) :: ILENS(NPRTRW),IOFFS(NPRTRW),ILENR(NPRTRW),IOFFR(NPRTRW) - INTEGER(KIND=JPIM) :: J, FROM_SEND, TO_SEND, FROM_RECV, TO_RECV, IRANK - INTEGER(KIND=JPIB) :: JPOS, ISTA, IEND, ILEN - REAL(KIND=JPHOOK) :: ZHOOK_HANDLE - INTEGER(KIND=JPIM) :: IERROR - - TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR - TYPE(TRMTOLAD_HANDLE), INTENT(IN) :: HTRMTOL - -#if ECTRANS_HAVE_MPI - TYPE(MPI_COMM) :: LOCAL_COMM -#endif - -#ifdef PARKINDTRANS_SINGLE -#define TRMTOL_DTYPE MPI_REAL4 -#else -#define TRMTOL_DTYPE MPI_REAL8 -#endif - -#if ECTRANS_HAVE_MPI - IF(.NOT. LMPOFF) THEN - LOCAL_COMM%MPI_VAL = MPL_ALL_MS_COMM - ENDIF -#endif - - IF (LHOOK) CALL DR_HOOK('TRMTOLAD',0,ZHOOK_HANDLE) - - CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HFOUBUF_IN),& - & 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(PFBUF_IN(1))) - - IF(NPROC > 1) THEN - DO J=1,NPRTRW - ILENS(J) = D%NLTSFTB(J)*2*KF_LEG - IOFFS(J) = D%NSTAGT1B(J)*2*KF_LEG - ILENR(J) = D%NLTSGTB(J)*2*KF_LEG - IOFFR(J) = D%NSTAGT0B(J)*2*KF_LEG - ENDDO - - CALL GSTATS(807,0) - - ! copy to self workaround - IRANK = MPL_MYRANK(MPL_ALL_MS_COMM) - IF (ILENS(IRANK) /= ILENR(IRANK)) THEN - WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK) - CALL ABORT_TRANS("TRMTOL: ILENS(IRANK) /= ILENR(IRANK)") - ENDIF - IF (ILENS(IRANK) > 0) THEN - FROM_SEND = IOFFS(IRANK) + 1 - TO_SEND = FROM_SEND + ILENS(IRANK) - 1 - FROM_RECV = IOFFR(IRANK) + 1 - TO_RECV = FROM_RECV + ILENR(IRANK) - 1 -#ifdef OMPGPU - !$OMP TARGET TEAMS MAP(PRESENT,ALLOC:PFBUF,PFBUF_IN) MAP(TO:FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) -#endif -#ifdef ACCGPU -#ifdef __HIP_PLATFORM_AMD__ - ! Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors - !$ACC KERNELS DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) -#else - !$ACC KERNELS ASYNC(1) DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) COPYIN(FROM_RECV,TO_RECV,FROM_SEND,TO_SEND) -#endif -#endif - PFBUF_IN(FROM_SEND:TO_SEND) = PFBUF(FROM_RECV:TO_RECV) -#ifdef OMPGPU - !$OMP END TARGET TEAMS -#endif -#ifdef ACCGPU - !$ACC END KERNELS -#endif - ILENS(IRANK) = 0 - ILENR(IRANK) = 0 - ENDIF - - IF (LSYNC_TRANS) THEN - CALL GSTATS(440,0) - CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') - CALL GSTATS(440,1) - ENDIF - CALL GSTATS(421,0) -#ifdef USE_GPU_AWARE_MPI -#ifdef OMPGPU - !$OMP TARGET DATA USE_DEVICE_PTR(PFBUF_IN,PFBUF) -#endif -#ifdef ACCGPU - !$ACC HOST_DATA USE_DEVICE(PFBUF_IN, PFBUF) -#endif -#else - !! this is safe-but-slow fallback for running without GPU-aware MPI - !$ACC UPDATE HOST(PFBUF_IN,PFBUF) -#endif - -#if ECTRANS_HAVE_MPI - CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,& - & PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,& - & LOCAL_COMM,IERROR) -#else - CALL ABORT_TRANS("Should not be here: MPI is disabled") -#endif - -#ifdef USE_GPU_AWARE_MPI -#ifdef ACCGPU - !$ACC END HOST_DATA -#endif -#ifdef OMPGPU - !$OMP END TARGET DATA -#endif -#else - !! this is safe-but-slow fallback for running without GPU-aware MPI - !$ACC UPDATE DEVICE(PFBUF_IN) -#endif - IF (LSYNC_TRANS) THEN - CALL GSTATS(441,0) - CALL MPL_BARRIER(MPL_ALL_MS_COMM,CDSTRING='') - CALL GSTATS(441,1) - ENDIF - CALL GSTATS(421,1) - -#ifdef ACCGPU -#ifndef __HIP_PLATFORM_AMD__ - ! Workaround for AMD GPUs - ASYNC execution of this kernel gives numerical errors - !$ACC WAIT(1) -#endif -#endif - CALL GSTATS(807,1) - ELSE - ILEN = 2_JPIB*D%NLTSGTB(MYSETW)*KF_LEG - ISTA = 2_JPIB*D%NSTAGT0B(MYSETW)*KF_LEG+1 - IEND = ISTA+ILEN-1 - CALL GSTATS(1608,0) -#ifdef OMPGPU - !$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) & - !$OMP SHARED(PFBUF,PFBUF_IN,ISTA,IEND) MAP(TO:ISTA,IEND) -#endif -#ifdef ACCGPU - !$ACC PARALLEL LOOP DEFAULT(NONE) PRESENT(PFBUF,PFBUF_IN) FIRSTPRIVATE(ISTA,IEND) -#endif - DO JPOS=ISTA,IEND - PFBUF_IN(JPOS) = PFBUF(JPOS) - ENDDO - CALL GSTATS(1608,1) - ENDIF - - IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE) - - ! ------------------------------------------------------------------ -END SUBROUTINE TRMTOLAD -END MODULE TRMTOLAD_MOD