Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 28 additions & 26 deletions src/trans/gpu/internal/trltomad_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ MODULE TRLTOMAD_MOD
TYPE(ALLOCATION_RESERVATION_HANDLE) :: HFOUBUF_IN
END TYPE
CONTAINS
FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOM)
FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOMAD)
USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB
USE TPM_DISTR, ONLY: D
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, RESERVE
Expand All @@ -30,27 +30,28 @@ FUNCTION PREPARE_TRLTOMAD(ALLOCATOR, KF_FS) RESULT(HTRLTOM)

TYPE(BUFFERED_ALLOCATOR), INTENT(INOUT) :: ALLOCATOR
INTEGER(KIND=JPIM), INTENT(IN) :: KF_FS
TYPE(TRLTOMAD_HANDLE) :: HTRLTOM

TYPE(TRLTOMAD_HANDLE) :: HTRLTOMAD
INTEGER(KIND=JPIB) :: IALLOC_SZ
REAL(KIND=JPRBT) :: DUMMY

HTRLTOM%HFOUBUF_IN = RESERVE(ALLOCATOR, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY), "HTRLTOM%HFOUBUF_IN")
IALLOC_SZ = 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(DUMMY)
HTRLTOMAD%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRLTOM%HFOUBUF_IN")
END FUNCTION

SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
!**** *TRLTOM * - transposition in Fourierspace
SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOMAD,PFBUF_IN,PFBUF,KF_FS)
!**** *TRLTOMAD * - transposition in Fourierspace

! Purpose.
! --------
! Transpose Fourier coefficients from partitioning
! over latitudes to partitioning over wave numbers
! This is done between inverse Legendre Transform
! and inverse FFT.
! This is the inverse routine of TRMTOL.
! This is the inverse routine of TRMTOLAD.

!** Interface.
! ----------
! *CALL* *TRLTOM(...)*
! *CALL* *TRLTOMAD(...)*

! Explicit arguments : PFBUF - Fourier coefficient buffer. It is
! -------------------- used for both input and output.
Expand Down Expand Up @@ -78,14 +79,14 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
! Modifications.
! --------------
! Original : 95-10-01
! Modified : 97-06-18 G. Mozdzynski - control MPI mailbox use
! Modified : 97-06-17 G. Mozdzynski - control MPI mailbox use
! (NCOMBFLEN) for nphase.eq.1
! Modified : 99-05-28 D.Salmond - Optimise copies.
! Modified : 00-02-02 M.Hamrud - Remove NPHASE
! D.Salmond : 01-11-23 LIMP_NOOLAP Option for non-overlapping message
! passing and buffer packing
! G.Mozdzynski: 08-01-01 Cleanup
! Y.Seity : 07-08-30 Add barrier synchronisation under LSYNC_TRANS
! Y.Seity : 07-08-31 add barrier synchronisation under LSYNC_TRANS
! ------------------------------------------------------------------

USE PARKIND_ECTRANS, ONLY: JPIM, JPRBT, JPIB
Expand All @@ -97,8 +98,8 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
USE MPI_F08, ONLY: MPI_COMM, MPI_REAL4, MPI_REAL8
! Missing: MPI_ALLTOALLV on purpose due to cray-mpi bug (see https://github.com/ecmwf-ifs/ectrans/pull/157)
#endif
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE BUFFERED_ALLOCATOR_MOD, ONLY: BUFFERED_ALLOCATOR, ASSIGN_PTR, GET_ALLOCATION
USE TPM_STATS, ONLY: GSTATS => GSTATS_NVTX
USE ISO_C_BINDING, ONLY: C_SIZEOF
USE ABORT_TRANS_MOD, ONLY: ABORT_TRANS

Expand All @@ -115,15 +116,15 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
INTEGER(KIND=JPIM) :: IERROR

TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
TYPE(TRLTOMAD_HANDLE), INTENT(IN) :: HTRLTOM
TYPE(TRLTOMAD_HANDLE), INTENT(IN) :: HTRLTOMAD
#if ECTRANS_HAVE_MPI
TYPE(MPI_COMM) :: LOCAL_COMM
#endif

#ifdef PARKINDTRANS_SINGLE
#define TRLTOM_DTYPE MPI_REAL4
#define TRLTOMAD_DTYPE MPI_REAL4
#else
#define TRLTOM_DTYPE MPI_REAL8
#define TRLTOMAD_DTYPE MPI_REAL8
#endif

#if ECTRANS_HAVE_MPI
Expand All @@ -132,10 +133,10 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
ENDIF
#endif

IF (LHOOK) CALL DR_HOOK('TRLTOM',0,ZHOOK_HANDLE)
IF (LHOOK) CALL DR_HOOK('TRLTOMAD',0,ZHOOK_HANDLE)

CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOM%HFOUBUF_IN),&
& 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(PFBUF_IN(1)))
CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRLTOMAD%HFOUBUF_IN),&
& 1_JPIB, 2_JPIB*D%NLENGT0B*KF_FS*C_SIZEOF(PFBUF_IN(1)))


#ifdef OMPGPU
Expand All @@ -159,7 +160,7 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
IRANK = MPL_MYRANK(MPL_ALL_MS_COMM)
IF (ILENS(IRANK) /= ILENR(IRANK)) THEN
WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK)
CALL ABORT_TRANS("TRLTOM: Error - ILENS(IRANK) /= ILENR(IRANK)")
CALL ABORT_TRANS("TRLTOMAD: Error - ILENS(IRANK) /= ILENR(IRANK)")
ENDIF
IF (ILENS(IRANK) > 0) THEN
FROM_SEND = IOFFS(IRANK) + 1
Expand Down Expand Up @@ -205,19 +206,18 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
#endif
#endif
#if ECTRANS_HAVE_MPI
CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRLTOM_DTYPE,&
& PFBUF_IN,ILENS,IOFFS,TRLTOM_DTYPE,&
& LOCAL_COMM,IERROR)
CALL MPI_ALLTOALLV(PFBUF, ILENR, IOFFR, TRLTOMAD_DTYPE, PFBUF_IN, ILENS, IOFFS, &
& TRLTOMAD_DTYPE, LOCAL_COMM,IERROR)
#else
CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif
#ifdef USE_GPU_AWARE_MPI
#ifdef OMPGPU
!$OMP END TARGET DATA
#endif
#ifdef ACCGPU
!$ACC END HOST_DATA
#endif
#ifdef OMPGPU
!$OMP END TARGET DATA
#endif
#else
!! this is safe-but-slow fallback for running without GPU-aware MPI
#ifdef OMPGPU
Expand All @@ -244,7 +244,8 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
IEND = ISTA+ILEN-1
CALL GSTATS(1607,0)
#ifdef OMPGPU
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) SHARED(IEND,ISTA,PFBUF_IN,PFBUF)
!$OMP TARGET TEAMS DISTRIBUTE PARALLEL DO DEFAULT(NONE) &
!$OMP SHARED(IEND,ISTA,PFBUF_IN,PFBUF)
#endif
#ifdef ACCGPU
!$ACC PARALLEL LOOP DEFAULT(NONE) FIRSTPRIVATE(ISTA,IEND)
Expand All @@ -263,6 +264,7 @@ SUBROUTINE TRLTOMAD(ALLOCATOR,HTRLTOM,PFBUF_IN,PFBUF,KF_FS)
#endif

IF (LHOOK) CALL DR_HOOK('TRLTOMAD',1,ZHOOK_HANDLE)

! ------------------------------------------------------------------
END SUBROUTINE TRLTOMAD
END SUBROUTINE TRLTOMAD
END MODULE TRLTOMAD_MOD
51 changes: 36 additions & 15 deletions src/trans/gpu/internal/trmtolad_mod.F90
Original file line number Diff line number Diff line change
Expand Up @@ -38,21 +38,21 @@ FUNCTION PREPARE_TRMTOLAD(ALLOCATOR, KF_LEG) RESULT(HTRMTOLAD)
HTRMTOLAD%HFOUBUF_IN = RESERVE(ALLOCATOR, IALLOC_SZ, "HTRMTOLAD%HFOUBUF_IN")
END FUNCTION

SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
!**** *trmtol * - transposition in Fourier space
SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOLAD,PFBUF_IN,PFBUF,KF_LEG)
!**** *TRMTOLAD * - transposition in Fourier space

! Purpose.
! --------
! Transpose Fourier buffer data from partitioning
! over wave numbers to partitioning over latitudes.
! It is called between direct FFT and direct Legendre
! transform.
! This routine is the inverse of TRLTOM.
! This routine is the inverse of TRLTOMAD.


!** Interface.
! ----------
! *call* *trmtol(...)*
! *CALL* *TRMTOLAD(...)*

! Explicit arguments : PFBUF - Fourier coefficient buffer. It is
! -------------------- used for both input and output.
Expand Down Expand Up @@ -116,16 +116,16 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
INTEGER(KIND=JPIM) :: IERROR

TYPE(BUFFERED_ALLOCATOR), INTENT(IN) :: ALLOCATOR
TYPE(TRMTOLAD_HANDLE), INTENT(IN) :: HTRMTOL
TYPE(TRMTOLAD_HANDLE), INTENT(IN) :: HTRMTOLAD

#if ECTRANS_HAVE_MPI
TYPE(MPI_COMM) :: LOCAL_COMM
#endif

#ifdef PARKINDTRANS_SINGLE
#define TRMTOL_DTYPE MPI_REAL4
#define TRMTOLAD_DTYPE MPI_REAL4
#else
#define TRMTOL_DTYPE MPI_REAL8
#define TRMTOLAD_DTYPE MPI_REAL8
#endif

#if ECTRANS_HAVE_MPI
Expand All @@ -136,9 +136,16 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)

IF (LHOOK) CALL DR_HOOK('TRMTOLAD',0,ZHOOK_HANDLE)

CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOL%HFOUBUF_IN),&
CALL ASSIGN_PTR(PFBUF_IN, GET_ALLOCATION(ALLOCATOR, HTRMTOLAD%HFOUBUF_IN),&
& 1_JPIB, 2_JPIB*D%NLENGT1B*KF_LEG*C_SIZEOF(PFBUF_IN(1)))

#ifdef OMPGPU
!$OMP TARGET DATA MAP(PRESENT,ALLOC:PFBUF,PFBUF_IN)
#endif
#ifdef ACCGPU
!$ACC DATA PRESENT(PFBUF,PFBUF_IN)
#endif

IF(NPROC > 1) THEN
DO J=1,NPRTRW
ILENS(J) = D%NLTSFTB(J)*2*KF_LEG
Expand All @@ -153,7 +160,7 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
IRANK = MPL_MYRANK(MPL_ALL_MS_COMM)
IF (ILENS(IRANK) /= ILENR(IRANK)) THEN
WRITE(NERR,*) "ERROR", ILENS(IRANK), ILENR(IRANK)
CALL ABORT_TRANS("TRMTOL: ILENS(IRANK) /= ILENR(IRANK)")
CALL ABORT_TRANS("TRMTOLAD: Error - ILENS(IRANK) /= ILENR(IRANK)")
ENDIF
IF (ILENS(IRANK) > 0) THEN
FROM_SEND = IOFFS(IRANK) + 1
Expand Down Expand Up @@ -197,17 +204,19 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
#endif
#else
!! this is safe-but-slow fallback for running without GPU-aware MPI
#ifdef OMPGPU
!$OMP TARGET UPDATE FROM(PFBUF_IN,PFBUF)
#endif
#ifdef ACCGPU
!$ACC UPDATE HOST(PFBUF_IN,PFBUF)
#endif

#endif
#if ECTRANS_HAVE_MPI
CALL MPI_ALLTOALLV(PFBUF,ILENR,IOFFR,TRMTOL_DTYPE,&
& PFBUF_IN,ILENS,IOFFS,TRMTOL_DTYPE,&
& LOCAL_COMM,IERROR)
CALL MPI_ALLTOALLV(PFBUF, ILENR, IOFFR, TRMTOLAD_DTYPE, PFBUF_IN, ILENS, IOFFS, &
& TRMTOLAD_DTYPE, LOCAL_COMM,IERROR)
#else
CALL ABORT_TRANS("Should not be here: MPI is disabled")
#endif

#ifdef USE_GPU_AWARE_MPI
#ifdef ACCGPU
!$ACC END HOST_DATA
Expand All @@ -217,7 +226,12 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
#endif
#else
!! this is safe-but-slow fallback for running without GPU-aware MPI
#ifdef OMPGPU
!$OMP TARGET UPDATE TO(PFBUF_IN)
#endif
#ifdef ACCGPU
!$ACC UPDATE DEVICE(PFBUF_IN)
#endif
#endif
IF (LSYNC_TRANS) THEN
CALL GSTATS(441,0)
Expand Down Expand Up @@ -251,7 +265,14 @@ SUBROUTINE TRMTOLAD(ALLOCATOR,HTRMTOL,PFBUF_IN,PFBUF,KF_LEG)
CALL GSTATS(1608,1)
ENDIF

IF (LHOOK) CALL DR_HOOK('TRMTOL',1,ZHOOK_HANDLE)
#ifdef OMPGPU
!$OMP END TARGET DATA
#endif
#ifdef ACCGPU
!$ACC END DATA
#endif

IF (LHOOK) CALL DR_HOOK('TRMTOLAD',1,ZHOOK_HANDLE)

! ------------------------------------------------------------------
END SUBROUTINE TRMTOLAD
Expand Down
Loading