diff --git a/CMakeLists.txt b/CMakeLists.txt index 947a0f6..7ce5e6e 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -8,6 +8,7 @@ set(CMAKE_CXX_STANDARD 11) option(OPENACC "Enable OpenACC support" OFF) option(ACC_MULTICORE "Whether for OpenACC to compile for multicore" OFF) +option(ACC_KERNELS "Whether for OpenACC to use the kernels directives" OFF) option(OPENMP "Enable OpenMP support" ON) option(OPENMP_OFFLOAD "Enable OpenMP offloading to GPU support" OFF) option(ROCM "Enable ROCm, offloading to MI100 using OpenMPI 4.5" OFF) @@ -110,23 +111,28 @@ if ((NOT OPENACC) AND OPENMP_FOUND) else () # OpenACC is incompatible with OpenMP, either one or the other or none if (OPENACC) - add_definitions(-DHAVE_OPENACC) - if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^NVHPC|PGI") - if (ACC_MULTICORE) - message(STATUS "Apply OpenACC directives for offloading to multicore CPU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc=multicore -Minfo=acc") + add_definitions(-DHAVE_OPENACC) + if ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^NVHPC|PGI") + if (ACC_KERNELS) + message(STATUS "Apply OpenACC kernels directives") + add_definitions(-DHAVE_OPENACC_KERNELS) + endif () + if (ACC_MULTICORE) + message(STATUS "Apply OpenACC directives for offloading to multicore CPU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc=multicore -Minfo=acc") set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -acc=multicore -Minfo=acc") - else () - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -gpu=mem:managed -Minfo=acc") + else () + message(STATUS "Apply OpenACC directives for offloading to GPU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -acc -gpu=mem:managed -Minfo=acc") set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -acc -gpu=mem:managed -Minfo=acc") - endif () - message(STATUS "Using PGI or Nvidia C++ compiler, added OpenACC compiler switches") - elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^GNU") - set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenacc -fopt-info-optimized-omp") - set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fopenacc -fopt-info-optimized-omp") - message(STATUS "Using GNU C++ compiler, added OpenACC compiler switches") - endif () - message(STATUS "OpenACC enabled") + endif () + message(STATUS "Using PGI or Nvidia C++ compiler, added OpenACC compiler switches") + elseif ("${CMAKE_CXX_COMPILER_ID}" MATCHES "^GNU") + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fopenacc -fopt-info-optimized-omp") + set(CMAKE_Fortran_FLAGS "${CMAKE_Fortran_FLAGS} -fopenacc -fopt-info-optimized-omp") + message(STATUS "Using GNU C++ compiler, added OpenACC compiler switches") + endif () + message(STATUS "OpenACC enabled") endif () endif () diff --git a/upwind/fortran/CMakeLists.txt b/upwind/fortran/CMakeLists.txt index cb61b02..71b4066 100644 --- a/upwind/fortran/CMakeLists.txt +++ b/upwind/fortran/CMakeLists.txt @@ -1,5 +1,4 @@ -configure_file(upwindF08Acc.sl.in upwindF08Acc.sl) -configure_file(upwindF08OmpGpu.sl.in upwindF08OmpGpu.sl) +configure_file(upwindF08Gpu.sl.in upwindF08Gpu.sl) if (HAVE_OPENMP) add_definitions(-DHAVE_OPENMP) @@ -12,8 +11,7 @@ endif() add_executable(upwindFortran upwind.F90) add_executable(upwindF03 upwindF03.F90) add_executable(upwindF08 upwindF08.F90) -add_executable(upwindF08Acc upwindF08Acc.f90) -add_executable(upwindF08OmpGpu upwindF08OmpGpu.f90) +add_executable(upwindF08Gpu upwindF08Gpu.F90) add_test(NAME upwindFortran1 @@ -56,16 +54,11 @@ add_test(NAME upwindF08 COMMAND upwindF08 "${NUM_CELLS}" "${NUM_TIME_STEPS}") -add_test(NAME upwindF08Acc - COMMAND upwindF08Acc +add_test(NAME upwindF08Gpu + COMMAND upwindF08Gpu "${NUM_CELLS}" "${NUM_TIME_STEPS}") - -add_test(NAME upwindF08OmpGpu - COMMAND upwindF08OmpGpu - "${NUM_CELLS}" "${NUM_TIME_STEPS}") - -set_tests_properties(upwindF03 upwindF08 upwindF08Acc +set_tests_properties(upwindF03 upwindF08 upwindF08Gpu PROPERTIES PASS_REGULAR_EXPRESSION "[Cc]heck sum:[ ]*[1|0\\.999]") diff --git a/upwind/fortran/upwindF08Acc.f90 b/upwind/fortran/upwindF08Acc.f90 deleted file mode 100644 index b836612..0000000 --- a/upwind/fortran/upwindF08Acc.f90 +++ /dev/null @@ -1,385 +0,0 @@ -module upwind_mod - - implicit none - - integer, parameter :: r8 = selected_real_kind(12, 100) - - type upwind_type - ! number of space dimensions - integer :: ndims - - ! total number of cells in the domain - integer :: ntot - - ! number of cells in the ndims directions - integer, allocatable :: numCells(:) - - ! upwind direction - integer, allocatable :: upDirection(:) - - ! cell sizes in the ndims directions - real(r8), allocatable :: deltas(:) - - ! velocity - real(r8), allocatable :: v(:) - - ! domain lengths - real(r8), allocatable :: lengths(:) - - ! field as a flat array - real(r8), pointer :: f(:) - - ! product of the dimensions, used to switch back and forth - ! between the flat index and the multi-index representations - integer, allocatable :: dimProd(:) - - contains - - procedure :: new => upwind_new - procedure :: del => upwind_del - procedure :: advect => upwind_advect - procedure :: saveVTK => upwind_saveVTK - procedure :: getFlatIndex => upwind_getFlatIndex - - end type - -contains - - subroutine getIndexSet(i, ndims, numCells, dimProd, inds) - implicit none - integer, intent(in) :: i - integer, intent(in) :: ndims - integer, intent(in) :: numCells(:) - integer, intent(in) :: dimProd(:) - integer, intent(out) :: inds(:) - - !$acc routine - - integer :: j - - do j = 1, ndims - inds(j) = mod((i - 1)/dimProd(j), numCells(j)) + 1 - enddo - end subroutine getIndexSet - - ! Constructor - ! @param velocity velocity field (constant) - ! @param lengths domain lengths - ! @param numCells number of cells in the x, y, ... directions - subroutine upwind_new(this, velocity, lengths, numCells) - - class(upwind_type) :: this - real(r8), intent(in) :: velocity(:) - real(r8), intent(in) :: lengths(:) - integer, intent(in) :: numCells(:) - - integer :: j - - this % ndims = size(velocity) - - allocate(this % upDirection(this % ndims)) - allocate(this % deltas(this % ndims)) - allocate(this % v(this % ndims)) - allocate(this % lengths(this % ndims)) - allocate(this % numCells(this % ndims)) - allocate(this % dimProd(this % ndims)) - - this % v = velocity - this % lengths = lengths - this % numCells = numCells - - ! compute the total number of cells and other stuff - this % ntot = 1 - do j = 1, this % ndims - this % upDirection(j) = -1 - if (velocity(j) < 0.) then - this % upDirection(j) = 1 - endif - this % deltas(j) = lengths(j) / numCells(j) - this % ntot = this % ntot * numCells(j) - enddo - - this % dimProd(this % ndims) = 1 - do j = this % ndims - 1, 1, -1 - this % dimProd(j) = this % dimProd(j + 1) * this % numCells(j + 1) - enddo - - allocate(this % f(this % ntot)) - - ! initialize the field, zero everywhere except for the - ! low corner cell where the field is one - this % f = 0 - this % f(1) = 1 - - end subroutine - - ! Destructor - subroutine upwind_del(this) - - class(upwind_type) :: this - - deallocate(this % v) - deallocate(this % lengths) - deallocate(this % numCells) - deallocate(this % upDirection) - deallocate(this % deltas) - deallocate(this % dimProd) - deallocate(this % f) - - end subroutine - - ! Advance by one time step - ! @param deltaTime time step - subroutine upwind_advect(this, deltaTime) - - class(upwind_type) :: this - real(r8), intent(in) :: deltaTime - - real(r8), allocatable :: oldF(:) - integer :: i, j, oldIndex, upI - integer :: inds(this % ndims) - real(r8), pointer :: fptr(:) - real(r8) :: v(this % ndims), deltas(this % ndims) - integer :: ntot, ndims, numCells(this % ndims), dimProd(this % ndims) - integer :: upDirection(this % ndims) - - ! allocate and copy the field - allocate(oldF(this % ntot)) - - fptr => this % f - ntot = this % ntot - ndims = this % ndims - do j = 1, ndims - numCells(j) = this % numCells(j) - upDirection(j) = this % upDirection(j) - dimProd(j) = this % dimProd(j) - v(j) = this % v(j) - deltas(j) = this % deltas(j) - enddo - - !$acc parallel loop - do i = 1, ntot - oldF(i) = fptr(i) - enddo - !$acc end parallel loop - - ! iterate over the cells - !$acc parallel loop private(inds, j, oldIndex, upI) - do i = 1, ntot - - ! compute the index set of this cell - call getIndexSet(i, ndims, numCells, dimProd, inds) - !do j = 1, ndims - ! inds(j) = mod((i - 1)/dimProd(j), numCells(j)) + 1 - !enddo - - do j = 1, ndims - - ! cache the cell index - oldIndex = inds(j) - - ! increment the cell index - inds(j) = inds(j) + upDirection(j) - - ! apply periodic BCs - inds(j) = modulo(inds(j) + numCells(j) - 1, numCells(j)) + 1 - - ! compute the new flat index - upI = dot_product(dimProd, inds - 1) + 1 - - ! update the field - fptr(i) = fptr(i) - & - & deltaTime*v(j)*upDirection(j)*(oldF(upI) - oldF(i))/deltas(j) - - ! reset the index - inds(j) = oldIndex - enddo - enddo - !$acc end parallel loop - deallocate(oldF) - - end subroutine - - subroutine upwind_saveVTK(this, filename) - class(upwind_type) :: this - character(len=*), intent(in) :: filename - - integer iunit, i - - ! f2008 - !open(newunit = iunit, file = filename, status = 'unknown') - iunit = 10 - ! f95 - open(unit = iunit, file = filename, status = 'unknown') - write(iunit, '(a)') '# vtk DataFile Version 2.0' - write(iunit, '(a)') 'upwind.f90' - write(iunit, '(a)') 'ASCII' - write(iunit, '(a)') 'DATASET RECTILINEAR_GRID' - - ! in VTK the first dimension varies fastest so need - ! to invert the order of the dimensions - if (this % ndims > 2) then - write(iunit, '(a, i10, i10, i10)') 'DIMENSIONS ', & - & this % numCells(3) + 1, this % numCells(2) + 1, this % numCells(1) + 1 - else - if (this % ndims > 1) then - write(iunit, '(a, i10, i10)') 'DIMENSIONS 1', & - & this % numCells(2) + 1, this % numCells(1) + 1 - else - write(iunit, '(a, i10)') 'DIMENSIONS 1 1', this % numCells(1) + 1 - endif - endif - - write(iunit, '(a, i10, a)') 'X_COORDINATES ', this % numCells(1) + 1, ' double' - do i = 1, this % numCells(1) + 1 - write(iunit, '(e20.7)') 0.0 + this % deltas(1) * (i - 1) - enddo - - write(iunit, *) - if (this % ndims > 1) then - write(iunit, '(a, i10, a)') 'Y_COORDINATES ', this % numCells(2) + 1, ' double' - do i = 1, this % numCells(2) + 1 - write(iunit, '(e20.7)') 0.0 + this % deltas(2) * (i - 1) - enddo - else - write(iunit, '(a)') 'Y_COORDINATES 1 double' - write(iunit, '(a)') '0.0' - endif - - write(iunit, *) - if (this % ndims > 2) then - write(iunit, '(a, i10, a)') 'Z_COORDINATES ', this % numCells(3) + 1, ' double' - do i = 1, this % numCells(3) + 1 - write(iunit, '(e20.7)') 0.0 + this % deltas(3) * (i - 1) - enddo - else - write(iunit, '(a)') 'Z_COORDINATES 1 double' - write(iunit, '(a)') '0.0' - endif - - write(iunit, '(a, i20)') 'CELL_DATA ', this % ntot - write(iunit, '(a)') 'SCALARS f double 1' - write(iunit, '(a)') 'LOOKUP_TABLE default' - do i = 1, this % ntot - write(iunit, '(e20.7)') this % f(i) - enddo - - close(iunit) - - end subroutine - - subroutine upwind_print(this) - class(upwind_type), intent(in) :: this - integer :: i - - do i = 1, this % ntot - write(*, '(a, i10, a, e20.13)') 'i = ', i, ' f = ', this % f(i) - enddo - - end subroutine - - pure function upwind_getFlatIndex(this, inds) result(res) - class(upwind_type), intent(in) :: this - integer, intent(in) :: inds(:) - integer :: res - - res = dot_product(this % dimProd, inds - 1) + 1 - - end function - -end module - -! Converts a string into an integer -subroutine str2int(str, i, stat) - implicit none - ! Arguments - character(len=*), intent(in) :: str - integer, intent(out) :: i - integer, intent(out) :: stat - - read(str,*,iostat=stat) i -end subroutine str2int - -program main - - use upwind_mod - - implicit none - - integer, parameter :: ndims = 3 - - integer :: argc, numCells(ndims), n, ier, numTimeSteps, i, j, & - & numThreads, maxNumThreads, threadId - logical :: doVtk - character(len=32) :: argv - real(r8) :: velocity(ndims) - real(r8) :: lengths(ndims) - real(r8) :: courant, dt, dx, val, chksum - type(upwind_type) :: up - - numCells = -1 - doVtk = .FALSE. - - ! default number of steps - numTimeSteps = 100 - argc = 0 - do - call get_command_argument(argc, argv) - if (len_trim(argv) == 0) exit - call str2int(argv, n, ier) - if (argc == 1) then - numCells = n - else if (argc == 2) then - numTimeSteps = n - else if (argc == 3 .and. argv == 'vtk') then - doVtk = .TRUE. - endif - argc = argc + 1 - enddo - - if (argc < 2) then - stop 'must specify number of cells in each direction.' - endif - - write(*, '(a)', advance='no') 'number of cells: ' - do i = 1, ndims - write(*, '(i10, a)', advance='no') numCells(i), ' ' - enddo - write(*, *) ' ' ! new line - write(*, '(a,i10)') 'number of time steps: ', numTimeSteps - - ! velocity field - velocity = 1 - - ! domain lengths - lengths = 1 - - ! compute time step from Courant's condition - courant = 0.1_r8 - dt = huge(1.0_r8) - do j = 1, ndims - dx = lengths(j) / real(numCells(j), r8) - val = courant * dx / velocity(j) - dt = min(val, dt) - enddo - - ! instantiate up - call up % new(velocity, lengths, numCells) - - ! call up % saveVTK('up0.vtk') - - ! advance - do i = 1, numTimeSteps - call up % advect(dt) - enddo - - write(*,'(a, f15.9)') 'check sum: ', sum(up % f) - - if (doVtk) then - call up % saveVTK('up.vtk') - endif - - ! clean up - call up % del() - -end program diff --git a/upwind/fortran/upwindF08OmpGpu.f90 b/upwind/fortran/upwindF08Gpu.F90 similarity index 92% rename from upwind/fortran/upwindF08OmpGpu.f90 rename to upwind/fortran/upwindF08Gpu.F90 index 6b50321..ddae6c7 100644 --- a/upwind/fortran/upwindF08OmpGpu.f90 +++ b/upwind/fortran/upwindF08Gpu.F90 @@ -53,7 +53,11 @@ subroutine getIndexSet(i, ndims, numCells, dimProd, inds) integer, intent(in) :: dimProd(:) integer, intent(out) :: inds(:) +#if defined(HAVE_OPENACC) + !$acc routine +#else !$omp declare target +#endif integer :: j @@ -157,15 +161,43 @@ subroutine upwind_advect(this, deltaTime) deltas(j) = this % deltas(j) enddo +#if defined(HAVE_OPENACC) + #if defined(HAVE_OPENACC_KERNELS) + #warning "Using OpenACC kernels directive" + !$acc kernels + #else + !$acc parallel loop + #endif +#else !$omp target !$omp parallel do +#endif do i = 1, ntot oldF(i) = fptr(i) enddo +#if defined(HAVE_OPENACC) + #if defined(HAVE_OPENACC_KERNELS) + !$acc end kernels + #else + !$acc end parallel loop + #endif +#else !$omp end parallel do + !$omp end target +#endif ! iterate over the cells +#if defined(HAVE_OPENACC) + #if defined(HAVE_OPENACC_KERNELS) + #warning "Using OpenACC kernels directive" + !$acc kernels + #else + !$acc parallel loop private(inds, j, oldIndex, upI) + #endif +#else + !$omp target !$omp parallel do private(inds, j, oldIndex, upI) +#endif do i = 1, ntot ! compute the index set of this cell @@ -187,7 +219,7 @@ subroutine upwind_advect(this, deltaTime) ! compute the new flat index upI = dot_product(dimProd, inds - 1) + 1 - + ! update the field fptr(i) = fptr(i) - & & deltaTime*v(j)*upDirection(j)*(oldF(upI) - oldF(i))/deltas(j) @@ -196,8 +228,16 @@ subroutine upwind_advect(this, deltaTime) inds(j) = oldIndex enddo enddo +#if defined(HAVE_OPENACC) + #if defined(HAVE_OPENACC_KERNELS) + !$acc end kernels + #else + !$acc end parallel loop + #endif +#else !$omp end parallel do !$omp end target +#endif deallocate(oldF) end subroutine diff --git a/upwind/fortran/upwindF08Acc.sl.in b/upwind/fortran/upwindF08Gpu.sl.in similarity index 87% rename from upwind/fortran/upwindF08Acc.sl.in rename to upwind/fortran/upwindF08Gpu.sl.in index eb69f23..c86a493 100644 --- a/upwind/fortran/upwindF08Acc.sl.in +++ b/upwind/fortran/upwindF08Gpu.sl.in @@ -16,6 +16,6 @@ nvidia-smi ncells=512 nsteps=10 -exe="apptainer exec --nv /nesi/nobackup/pletzera/ngarch_nvhpc.sif @CMAKE_BINARY_DIR@/upwind/fortran/upwindF08Acc $ncells $nsteps" +exe="apptainer exec --nv /nesi/nobackup/pletzera/ngarch_nvhpc.sif @CMAKE_BINARY_DIR@/upwind/fortran/upwindF08Gpu $ncells $nsteps" time $exe diff --git a/upwind/fortran/upwindF08OmpGpu.sl.in b/upwind/fortran/upwindF08OmpGpu.sl.in deleted file mode 100644 index 1fa3a5f..0000000 --- a/upwind/fortran/upwindF08OmpGpu.sl.in +++ /dev/null @@ -1,21 +0,0 @@ -#!/bin/bash -e -#SBATCH --job-name=upwind # job name (shows up in the queue) -#SBATCH --time=00:10:00 # Walltime (HH:MM:SS) -#SBATCH --cpus-per-task=1 -#SBATCH --ntasks=1 -#SBATCH --mem=6g -##SBTACH --gpus-per-node=A100-1g.5gb:1 -##SBATCH --gpus-per-node=A100:1 -#SBATCH --gpus-per-node=P100:1 - -ml purge -ml Apptainer CUDA/11.8.0 -unset PYTHONPATH - -nvidia-smi - -ncells=512 -nsteps=10 -exe="apptainer exec --nv /nesi/nobackup/pletzera/ngarch_nvhpc.sif @CMAKE_BINARY_DIR@/upwind/fortran/upwindF08OmpGpu $ncells $nsteps" -time $exe -