diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc
index 142106aec..8c421ba90 100644
--- a/mpp/include/group_update_pack.inc
+++ b/mpp/include/group_update_pack.inc
@@ -18,64 +18,85 @@
 !***********************************************************************
 if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
-!$OMP             private(buffer_pos,pos,m,is,ie,js,je,rotation, &
-!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP parallel do shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
+!$OMP             private(buffer_pos,pos,m,is,ie,js,je,rotation, &
+!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) &
+!$OMP             if (.not.use_device_ptr)
   do n = 1, npack
      buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
+     is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1
+     js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1
      rotation = group%pack_rotation(n)
     if( group%pack_type(n) == FIELD_S ) then
         select case( rotation )
         case(ZERO)
           do l=1, group%nscalar ! loop over number of fields
              ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do j = js, je
                    do i = is, ie
-                      pos = pos + 1
-                      buffer(pos) = field(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                      buffer(idx) = field(i,j,k)
                    end do
                 end do
              enddo
+             pos = pos + ksize*nj*ni
           enddo
         case( MINUS_NINETY )
           do l=1,group%nscalar ! loop over number of fields
              ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do i = is, ie
                    do j = je, js, -1
-                      pos = pos + 1
-                      buffer(pos) = field(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                      buffer(idx) = field(i,j,k)
                    end do
                 end do
              end do
+             pos = pos + ksize*nj*ni
           end do
         case( NINETY )
           do l=1,group%nscalar ! loop over number of fields
             ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
             do k = 1, ksize
                do i = ie, is, -1
                   do j = js, je
-                     pos = pos + 1
-                     buffer(pos) = field(i,j,k)
+                     idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                     buffer(idx) = field(i,j,k)
                   end do
                end do
            end do
+            pos = pos + ksize*nj*ni
          end do
         case( ONE_HUNDRED_EIGHTY )
           do l=1,group%nscalar ! loop over number of fields
              ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do j = je, js, -1
                    do i = ie, is, -1
-                      pos = pos + 1
-                      buffer(pos) = field(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                      buffer(idx) = field(i,j,k)
                    end do
                 end do
              end do
+             pos = pos + ksize*nj*ni
           end do
         end select
     else if( group%pack_type(n) == FIELD_X ) then
@@ -83,77 +104,109 @@ if( group%k_loop_inside ) then
         case(ZERO)
           do l=1, nvector ! loop over number of fields
              ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do j = js, je
                    do i = is, ie
-                      pos = pos + 1
-                      buffer(pos) = fieldx(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                      buffer(idx) = fieldx(i,j,k)
                    end do
                 end do
              end do
+             pos = pos + ksize*nj*ni
           end do
         case( MINUS_NINETY )
           if( BTEST(group%flags_v,SCALAR_BIT) ) then
              do l=1,nvector ! loop over number of fields
                 ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do i = is, ie
                       do j = je, js, -1
-                         pos = pos + 1
-                         buffer(pos) = fieldy(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                         buffer(idx) = fieldy(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           else
              do l=1,nvector ! loop over number of fields
                 ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do i = is, ie
                       do j = je, js, -1
-                         pos = pos + 1
-                         buffer(pos) = -fieldy(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                         buffer(idx) = -fieldy(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           end if
         case( NINETY )
           do l=1, nvector ! loop over number of fields
              ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do i = ie, is, -1
                    do j = js, je
-                      pos = pos + 1
-                      buffer(pos) = fieldy(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                      buffer(idx) = fieldy(i,j,k)
                    end do
                 end do
             end do
+             pos = pos + ksize*nj*ni
          end do
         case( ONE_HUNDRED_EIGHTY )
           if( BTEST(group%flags_v,SCALAR_BIT) ) then
              do l=1,nvector ! loop over number of fields
                 ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do j = je, js, -1
                       do i = ie, is, -1
-                         pos = pos + 1
-                         buffer(pos) = fieldx(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                         buffer(idx) = fieldx(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           else
              do l=1,nvector ! loop over number of fields
                 ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do j = je, js, -1
                       do i = ie, is, -1
-                         pos = pos + 1
-                         buffer(pos) = -fieldx(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                         buffer(idx) = -fieldx(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           end if
         end select ! select case( rotation(n) )
@@ -162,86 +215,118 @@ if( group%k_loop_inside ) then
         case(ZERO)
           do l=1, nvector ! loop over number of fields
              ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do j = js, je
                    do i = is, ie
-                      pos = pos + 1
-                      buffer(pos) = fieldy(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                      buffer(idx) = fieldy(i,j,k)
                    end do
                 end do
             end do
+             pos = pos + ksize*nj*ni
          end do
         case( MINUS_NINETY )
           do l=1,nvector ! loop over number of fields
              ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+             !$omp target teams distribute parallel do collapse(3) private(idx) &
+             !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
              do k = 1, ksize
                 do i = is, ie
                    do j = je, js, -1
-                      pos = pos + 1
-                      buffer(pos) = fieldx(i,j,k)
+                      idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1
+                      buffer(idx) = fieldx(i,j,k)
                    end do
                 end do
             end do
+             pos = pos + ksize*nj*ni
          end do
         case( NINETY )
           if( BTEST(group%flags_v,SCALAR_BIT) ) then
              do l=1, nvector ! loop over number of fields
                 ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do i = ie, is, -1
                       do j = js, je
-                         pos = pos + 1
-                         buffer(pos) = fieldx(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                         buffer(idx) = fieldx(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           else
              do l=1,nvector ! loop over number of fields
                 ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+                !$omp target teams distribute parallel do collapse(3) private(idx) &
+                !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
                 do k = 1, ksize
                    do i = ie, is, -1
                       do j = js, je
-                         pos = pos + 1
-                         buffer(pos) = -fieldx(i,j,k)
+                         idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1
+                         buffer(idx) = -fieldx(i,j,k)
                       end do
                    end do
                 end do
+                pos = pos + ksize*nj*ni
             end do
           end if
         case( ONE_HUNDRED_EIGHTY )
           if( BTEST(group%flags_v,SCALAR_BIT) ) then
             do l=1,nvector ! loop over number of fields
                ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+               !$omp target teams distribute parallel do collapse(3) private(idx) &
+               !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
               do k = 1, ksize
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) = fieldy(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = fieldy(i,j,k)
                     end do
                  end do
              end do
+              pos = pos + ksize*nj*ni
            end do
           else
             do l=1,nvector ! loop over number of fields
                ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+               !$omp target teams distribute parallel do collapse(3) private(idx) &
+               !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr)
+#endif
               do k = 1, ksize
                  do j = je, js, -1
                     do i = ie, is, -1
-                       pos = pos + 1
-                       buffer(pos) = -fieldy(i,j,k)
+                       idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1
+                       buffer(idx) = -fieldy(i,j,k)
                     end do
                  end do
              end do
+              pos = pos + ksize*nj*ni
            end do
           end if
         end select ! select case( rotation(n) )
     endif
  enddo
 else
+#ifndef __NVCOMPILER_OPENMP_GPU
 !$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP             private(buffer_pos,pos,m,is,ie,js,je,rotation, &
-!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
+#endif
   do nk = 1, npack*ksize
      n = (nk-1)/ksize + 1
      k = mod((nk-1), ksize) + 1
@@ -255,42 +340,62 @@ else
      buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos + (k-1)*group%pack_size(n)
-     is = group%pack_is(n); ie = group%pack_ie(n)
-     js = group%pack_js(n); je = group%pack_je(n)
+     is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1
+     js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1
      rotation = group%pack_rotation(n)
     if( group%pack_type(n) == FIELD_S ) then
        select case( rotation )
        case(ZERO)
          do l=1, group%nscalar ! loop over number of fields
             ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: field(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 buffer(pos) = field(i,j,k)
+                 idx = pos + (j-js)*ni + (i-is) + 1
+                 buffer(idx) = field(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         enddo
        case( MINUS_NINETY )
          do l=1,group%nscalar ! loop over number of fields
             ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: field(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do i = is, ie
               do j = je, js, -1
-                 pos = pos + 1
-                 buffer(pos) = field(i,j,k)
+                 idx = pos + (i-is)*nj + (je-j) + 1
+                 buffer(idx) = field(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        case( NINETY )
          do l=1,group%nscalar ! loop over number of fields
             ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: field(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do i = ie, is, -1
               do j = js, je
-                 pos = pos + 1
-                 buffer(pos) = field(i,j,k)
+                 idx = pos + (ie-i)*nj + (j-js) + 1
+                 buffer(idx) = field(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        case( ONE_HUNDRED_EIGHTY )
          do l=1,group%nscalar ! loop over number of fields
             ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: field(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do j = je, js, -1
               do i = ie, is, -1
-                 pos = pos + 1
-                 buffer(pos) = field(i,j,k)
+                 idx = pos + (je-j)*ni + (ie-i) + 1
+                 buffer(idx) = field(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        end select
     else if( group%pack_type(n) == FIELD_X ) then
@@ -298,65 +403,95 @@ else
        case(ZERO)
          do l=1, nvector ! loop over number of fields
             ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 buffer(pos) = fieldx(i,j,k)
+                 idx = pos + (j-js)*ni + (i-is) + 1
+                 buffer(idx) = fieldx(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        case( MINUS_NINETY )
          if( BTEST(group%flags_v,SCALAR_BIT) ) then
            do l=1,nvector ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do i = is, ie
                 do j = je, js, -1
-                   pos = pos + 1
-                   buffer(pos) = fieldy(i,j,k)
+                   idx = pos + (i-is)*nj + (je-j) + 1
+                   buffer(idx) = fieldy(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          else
            do l=1,nvector ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do i = is, ie
                 do j = je, js, -1
-                   pos = pos + 1
-                   buffer(pos) = -fieldy(i,j,k)
+                   idx = pos + (i-is)*nj + (je-j) + 1
+                   buffer(idx) = -fieldy(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          end if
        case( NINETY )
          do l=1, nvector ! loop over number of fields
            ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+           !$omp target teams distribute parallel do collapse(2) private(idx) &
+           !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
           do i = ie, is, -1
              do j = js, je
-                pos = pos + 1
-                buffer(pos) = fieldy(i,j,k)
+                idx = pos + (ie-i)*nj + (j-js) + 1
+                buffer(idx) = fieldy(i,j,k)
             end do
          end do
+          pos = pos + nj*ni
        end do
        case( ONE_HUNDRED_EIGHTY )
          if( BTEST(group%flags_v,SCALAR_BIT) ) then
            do l=1,nvector ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do j = je, js, -1
                 do i = ie, is, -1
-                   pos = pos + 1
-                   buffer(pos) = fieldx(i,j,k)
+                   idx = pos + (je-j)*ni + (ie-i) + 1
+                   buffer(idx) = fieldx(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          else
            do l=1,nvector ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do j = je, js, -1
                 do i = ie, is, -1
-                   pos = pos + 1
-                   buffer(pos) = -fieldx(i,j,k)
+                   idx = pos + (je-j)*ni + (ie-i) + 1
+                   buffer(idx) = -fieldx(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          end if
        end select ! select case( rotation(n) )
@@ -365,65 +500,95 @@ else
        case(ZERO)
          do l=1, nvector ! loop over number of fields
             ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 buffer(pos) = fieldy(i,j,k)
+                 idx = pos + (j-js)*ni + (i-is) + 1
+                 buffer(idx) = fieldy(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        case( MINUS_NINETY )
          do l=1,nvector ! loop over number of fields
             ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+            !$omp target teams distribute parallel do collapse(2) private(idx) &
+            !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
            do i = is, ie
               do j = je, js, -1
-                 pos = pos + 1
-                 buffer(pos) = fieldx(i,j,k)
+                 idx = pos + (i-is)*nj + (je-j) + 1
+                 buffer(idx) = fieldx(i,j,k)
               end do
            end do
+            pos = pos + nj*ni
         end do
        case( NINETY )
          if( BTEST(group%flags_v,SCALAR_BIT) ) then
            do l=1, nvector ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do i = ie, is, -1
                 do j = js, je
-                   pos = pos + 1
-                   buffer(pos) = fieldx(i,j,k)
+                   idx = pos + (ie-i)*nj + (j-js) + 1
+                   buffer(idx) = fieldx(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          else
            do l=1,nvector ! loop over number of fields
               ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldx(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do i = ie, is, -1
                 do j = js, je
-                   pos = pos + 1
-                   buffer(pos) = -fieldx(i,j,k)
+                   idx = pos + (ie-i)*nj + (j-js) + 1
+                   buffer(idx) = -fieldx(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          end if
        case( ONE_HUNDRED_EIGHTY )
          if( BTEST(group%flags_v,SCALAR_BIT) ) then
            do l=1,nvector ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do j = je, js, -1
                 do i = ie, is, -1
-                   pos = pos + 1
-                   buffer(pos) = fieldy(i,j,k)
+                   idx = pos + (je-j)*ni + (ie-i) + 1
+                   buffer(idx) = fieldy(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          else
            do l=1,nvector ! loop over number of fields
               ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+              !$omp target teams distribute parallel do collapse(2) private(idx) &
+              !$omp map(to: fieldy(is:ie,js:je,k)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr)
+#endif
              do j = je, js, -1
                 do i = ie, is, -1
-                   pos = pos + 1
-                   buffer(pos) = -fieldy(i,j,k)
+                   idx = pos + (je-j)*ni + (ie-i) + 1
+                   buffer(idx) = -fieldy(i,j,k)
                 end do
             end do
+             pos = pos + nj*ni
          end do
          end if
        end select ! select case( rotation(n) )
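
Note on the indexing change above: every packing hunk replaces the serial counter "pos = pos + 1" with a closed-form index, so each (i,j,k) gets an independent buffer slot and the loop nest can be collapsed and distributed across GPU threads. A minimal standalone sketch (illustrative sizes; not part of the patch) checking that the closed form reproduces the serial ordering for the ZERO rotation:

    program check_idx
      implicit none
      integer, parameter :: is = 3, ie = 7, js = 2, je = 6, ksize = 4
      integer, parameter :: ni = ie-is+1, nj = je-js+1
      integer :: i, j, k, pos, idx
      pos = 0
      do k = 1, ksize
         do j = js, je
            do i = is, ie
               pos = pos + 1                               ! serial counter (old code)
               idx = (k-1)*nj*ni + (j-js)*ni + (i-is) + 1  ! closed form (new code)
               if (idx /= pos) stop 'index mismatch'
            end do
         end do
      end do
      print *, 'closed-form index matches serial counter'
    end program check_idx

The rotated cases follow the same pattern, with the roles of ni and nj swapped to match their loop order.
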
diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc
index 7f60ed93d..a745982cf 100644
--- a/mpp/include/group_update_unpack.inc
+++ b/mpp/include/group_update_unpack.inc
@@ -18,92 +18,128 @@
 !***********************************************************************
 if( group%k_loop_inside ) then
-!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
-!$OMP             private(buffer_pos,pos,m,is, ie, js, je,rotation, &
-!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy, n,k )
+!$OMP parallel do shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
+!$OMP             private(buffer_pos,pos,m,is, ie, js, je,rotation, &
+!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) &
+!$OMP             if (.not.use_device_ptr)
   do n = nunpack, 1, -1
      buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
+     is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
+     js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
     if( group%unpack_type(n) == FIELD_S ) then
        do l=1,nscalar ! loop over number of fields
           ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(3) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+          !$omp map(from: field(is:ie,js:je,1:ksize)) if(use_device_ptr)
+#endif
          do k = 1, ksize
             do j = js, je
                do i = is, ie
-                  pos = pos + 1
-                  field(i,j,k) = buffer(pos)
+                  idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                  field(i,j,k) = buffer(idx)
               end do
            end do
         end do
+          pos = pos + ksize*nj*ni
       end do
     else if( group%unpack_type(n) == FIELD_X ) then
        do l=1,nvector ! loop over number of fields
           ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(3) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+          !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr)
+#endif
          do k = 1, ksize
             do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 fieldx(i,j,k) = buffer(pos)
+                 idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                 fieldx(i,j,k) = buffer(idx)
              end do
           end do
        end do
+          pos = pos + ksize*nj*ni
      end do
     else if( group%unpack_type(n) == FIELD_Y ) then
        do l=1,nvector ! loop over number of fields
           ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(3) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
+          !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr)
+#endif
          do k = 1, ksize
             do j = js, je
               do i = is, ie
-                 pos = pos + 1
-                 fieldy(i,j,k) = buffer(pos)
+                 idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
+                 fieldy(i,j,k) = buffer(idx)
              end do
           end do
        end do
+          pos = pos + ksize*nj*ni
      end do
     endif
  enddo
 else
+#ifndef __NVCOMPILER_OPENMP_GPU
 !$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
 !$OMP             private(buffer_pos,pos,m,is, ie, js, je,rotation, &
-!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k)
+!$OMP                     ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
+#endif
   do nk = nunpack*ksize, 1, -1
      n = (nk-1)/ksize + 1
      k = mod((nk-1), ksize) + 1
      buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
      pos = buffer_pos + (k-1)*group%unpack_size(n)
-     is = group%unpack_is(n); ie = group%unpack_ie(n)
-     js = group%unpack_js(n); je = group%unpack_je(n)
+     is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
+     js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
     if( group%unpack_type(n) == FIELD_S ) then
        do l=1,nscalar ! loop over number of fields
           ptr_field = group%addrs_s(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(2) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr)
+#endif
          do j = js, je
             do i = is, ie
-               pos = pos + 1
-               field(i,j,k) = buffer(pos)
+               idx = pos + (j-js)*ni + (i-is) + 1
+               field(i,j,k) = buffer(idx)
            end do
         end do
+          pos = pos + ni*nj
      end do
     else if( group%unpack_type(n) == FIELD_X ) then
        do l=1,nvector ! loop over number of fields
           ptr_fieldx = group%addrs_x(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(2) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr)
+#endif
          do j = js, je
            do i = is, ie
-              pos = pos + 1
-              fieldx(i,j,k) = buffer(pos)
+              idx = pos + (j-js)*ni + (i-is) + 1
+              fieldx(i,j,k) = buffer(idx)
           end do
        end do
+          pos = pos + ni*nj
      end do
     else if( group%unpack_type(n) == FIELD_Y ) then
        do l=1,nvector ! loop over number of fields
           ptr_fieldy = group%addrs_y(l)
+#ifdef __NVCOMPILER_OPENMP_GPU
+          !$omp target teams distribute parallel do collapse(2) private(idx) &
+          !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr)
+#endif
          do j = js, je
            do i = is, ie
-              pos = pos + 1
-              fieldy(i,j,k) = buffer(pos)
+              idx = pos + (j-js)*ni + (i-is) + 1
+              fieldy(i,j,k) = buffer(idx)
           end do
        end do
+          pos = pos + ni*nj
      end do
     endif
  enddo
diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc
index 82df7e316..ee66b0cd4 100644
--- a/mpp/include/mpp_comm_mpi.inc
+++ b/mpp/include/mpp_comm_mpi.inc
@@ -31,6 +31,7 @@
 !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
 !> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
 subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path )
+  !$ use omp_lib
   integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
   integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize
   integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages
@@ -54,6 +55,14 @@
   call MPI_COMM_RANK( mpp_comm_private, pe, error )
   call MPI_COMM_SIZE( mpp_comm_private, npes, error )
 
+  ! set default device to enable multi GPU parallelism
+  ! calls to both OpenACC and OpenMP runtimes are needed
+  ! because we use both do-concurrent and openmp
+  ! if you remove either, the code will run multiple
+  ! ranks on a _single_ GPU. Be careful out there!
+  !$ call omp_set_default_device(pe)
+  !$acc set device_num(pe)
+
   module_is_initialized = .TRUE.
   if (present(test_level)) then
     t_level = test_level
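
Note on device binding: mpp_init binds each rank's default device to its global MPI rank through both the OpenMP and OpenACC runtimes. On nodes with fewer GPUs than ranks one would usually wrap the rank with the device count; the patch passes the raw rank, so the subroutine name and the mod() below are illustrative assumptions, not what the patch does:

    subroutine bind_rank_to_device(pe)
      use omp_lib, only: omp_get_num_devices, omp_set_default_device
      implicit none
      integer, intent(in) :: pe
      integer :: ndev
      ndev = omp_get_num_devices()
      ! round-robin: rank 0 -> device 0, rank ndev -> device 0 again, ...
      if (ndev > 0) call omp_set_default_device(mod(pe, ndev))
      ! the matching OpenACC call would be: !$acc set device_num(mod(pe, ndev))
    end subroutine bind_rank_to_device
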
diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh
index 45be7ea53..6a4cb2465 100644
--- a/mpp/include/mpp_group_update.fh
+++ b/mpp/include/mpp_group_update.fh
@@ -419,20 +419,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags,
 end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_
 
-subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
+subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload)
   type(mpp_group_update_type), intent(inout) :: group
   type(domain2D),              intent(inout) :: domain
   MPP_TYPE_,                   intent(in)    :: d_type
+  logical, optional,           intent(in)    :: omp_offload
 
   integer   :: nscalar, nvector, nlist
   logical   :: recv_y(8)
   integer   :: nsend, nrecv, flags_v
   integer   :: msgsize
-  integer   :: from_pe, to_pe, buffer_pos, pos
+  integer   :: from_pe, to_pe, buffer_pos, pos, idx
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   integer   :: shift, gridtype, midpoint
   integer   :: npack, nunpack, rotation, isd
+  logical   :: use_device_ptr
 
   MPP_TYPE_ :: buffer(mpp_domains_stack_size)
   MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
@@ -448,6 +450,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
   nlist  = size(domain%list(:))
   gridtype = group%gridtype
 
+  use_device_ptr = .false.
+  if (present(omp_offload)) use_device_ptr = omp_offload
+
   !--- ksize_s must equal ksize_v
   if(nvector > 0 .AND. nscalar > 0) then
      if(group%ksize_s .NE. group%ksize_v) then
@@ -476,13 +481,16 @@
   !---pre-post receive.
   call mpp_clock_begin(group_recv_clock)
+#ifdef __NVCOMPILER_OPENMP_GPU
+  !$omp target enter data map(alloc: buffer) if(use_device_ptr)
+#endif
   do m = 1, nrecv
      msgsize = group%recv_size(m)
      from_pe = group%from_pe(m)
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_recv(m)
         call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., &
-             tag=COMM_TAG_1)
+             tag=COMM_TAG_1, omp_offload=omp_offload)
     end if
  end do
 
@@ -504,7 +512,7 @@
      if( msgsize .GT. 0 )then
         buffer_pos = group%buffer_pos_send(n)
         to_pe = group%to_pe(n)
-        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1)
+        call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload)
     endif
  enddo
  call mpp_clock_end(group_send_clock)
@@ -519,6 +527,9 @@
   nunpack = group%nunpack
   call mpp_clock_begin(group_unpk_clock)
 #include <group_update_unpack.inc>
+#ifdef __NVCOMPILER_OPENMP_GPU
+  !$omp target exit data map(release: buffer) if(use_device_ptr)
+#endif
   call mpp_clock_end(group_unpk_clock)
 
   ! ---northern boundary fold
@@ -644,10 +655,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
   integer   :: nscalar, nvector
   integer   :: nsend, nrecv, flags_v
   integer   :: msgsize, npack, rotation
-  integer   :: from_pe, to_pe, buffer_pos, pos
+  integer   :: from_pe, to_pe, buffer_pos, pos, idx
   integer   :: ksize, is, ie, js, je
-  integer   :: n, l, m, i, j, k, buffer_start_pos, nk
+  integer   :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
   logical   :: reuse_buf_pos
+  logical, parameter :: use_device_ptr = .false. ! placeholder
   character(len=8) :: text
 
   MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
@@ -749,11 +761,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
   MPP_TYPE_, intent(in) :: d_type
 
   integer   :: nsend, nrecv, nscalar, nvector
-  integer   :: k, buffer_pos, pos, m, n, l
+  integer   :: k, buffer_pos, pos, m, n, l, idx
   integer   :: is, ie, js, je, ksize, i, j
   integer   :: shift, gridtype, midpoint, flags_v
-  integer   :: nunpack, rotation, buffer_start_pos, nk, isd
+  integer   :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd
   logical   :: recv_y(8)
+  logical, parameter :: use_device_ptr = .false. ! placeholder
   MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
   MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
   MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v)
diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc
index aaa770cc0..b3eac0c3f 100644
--- a/mpp/include/mpp_transmit.inc
+++ b/mpp/include/mpp_transmit.inc
@@ -171,7 +171,7 @@
       call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request )
    end subroutine MPP_SEND_
 
-   subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request )
+   subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload )
      !a mpp_transmit with null arguments on the put side
      integer, intent(in) :: from_pe
      MPP_TYPE_, intent(out) :: get_data
@@ -180,6 +180,7 @@
      integer, intent(out), optional :: request
      integer, optional, intent(in) :: glen
+     logical, optional, intent(in) :: omp_offload
      integer :: get_len
      MPP_TYPE_ :: get_data1D(1)
      MPP_TYPE_ :: dummy(1)
@@ -189,17 +190,18 @@
      ptr = LOC(get_data)
      get_len=1; if(PRESENT(glen))get_len=glen
-     call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request )
+     call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request, omp_offload=omp_offload )
 
    end subroutine MPP_RECV_SCALAR_
 
-   subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request)
+   subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload)
      !a mpp_transmit with null arguments on the get side
      integer, intent(in) :: to_pe
      MPP_TYPE_, intent(in) :: put_data
      integer, optional, intent(in) :: plen
      integer, intent(in), optional :: tag
      integer, intent(out), optional :: request
+     logical, optional, intent(in) :: omp_offload
      integer :: put_len
      MPP_TYPE_ :: put_data1D(1)
      MPP_TYPE_ :: dummy(1)
@@ -207,7 +209,7 @@
      pointer( ptr, put_data1D )
      ptr = LOC(put_data)
      put_len=1; if(PRESENT(plen))put_len=plen
-     call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request )
+     call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request, omp_offload=omp_offload )
 
    end subroutine MPP_SEND_SCALAR_
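
Note on the optional flag: MPP_SEND_SCALAR_ and MPP_RECV_SCALAR_ forward omp_offload without testing present(), which is legal Fortran. An absent optional actual argument may be passed straight on to another optional dummy and remains absent downstream, so only the innermost routine needs a present() check. A self-contained sketch of the idiom (hypothetical names, not patch code):

    program optional_passthrough
      implicit none
      call outer()        ! flag absent through the whole chain
      call outer(.true.)  ! flag present through the whole chain
    contains
      subroutine outer(flag)
        logical, optional, intent(in) :: flag
        call inner(flag)  ! legal even when flag is absent
      end subroutine outer
      subroutine inner(flag)
        logical, optional, intent(in) :: flag
        logical :: use_flag
        use_flag = .false.
        if (present(flag)) use_flag = flag
        print *, 'use_flag = ', use_flag
      end subroutine inner
    end program optional_passthrough
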
diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh
index 1fdfc2f6c..729cc165b 100644
--- a/mpp/include/mpp_transmit_mpi.fh
+++ b/mpp/include/mpp_transmit_mpi.fh
@@ -37,7 +37,7 @@
 !!(avoiding f90 rank conformance check)
 !!caller is responsible for completion checks (mpp_sync_self) before and after
     subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, &
-                            & send_request )
+                            & send_request, omp_offload )
 
       integer, intent(in) :: put_len, to_pe, get_len, from_pe
       MPP_TYPE_, intent(in) :: put_data(*)
@@ -45,10 +45,15 @@
       logical, intent(in),  optional :: block
       integer, intent(in),  optional :: tag
       integer, intent(out), optional :: recv_request, send_request
+      logical, intent(in),  optional :: omp_offload
       logical :: block_comm
       integer :: i
      integer :: comm_tag
      integer :: rsize
+      logical :: use_device_ptr
+
+      use_device_ptr = .false.
+      if (present(omp_offload)) use_device_ptr = omp_offload
 
      if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' )
      if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return
@@ -82,8 +87,15 @@
               if( cur_send_request > max_request ) &
                  call mpp_error(FATAL, &
                  & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply")
-              call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
-                              request_send(cur_send_request), error)
+              if (use_device_ptr) then
+                 !$omp target data use_device_ptr(put_data)
+                 call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
+                                 request_send(cur_send_request), error)
+                 !$omp end target data
+              else
+                 call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
+                                 request_send(cur_send_request), error)
+              endif
           endif
           if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_)
       else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe
@@ -130,8 +142,15 @@
               if( cur_recv_request > max_request ) &
                  call mpp_error(FATAL, &
                  "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply")
-              call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
-                              request_recv(cur_recv_request), error )
+              if (use_device_ptr) then
+                 !$omp target data use_device_ptr(get_data)
+                 call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
+                                 request_recv(cur_recv_request), error )
+                 !$omp end target data
+              else
+                 call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
+                                 request_recv(cur_recv_request), error )
+              endif
               size_recv(cur_recv_request) = get_len
               type_recv(cur_recv_request) = MPI_TYPE_
           endif
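
Note on the device-pointer regions: inside a target data use_device_ptr region, references to the listed variable resolve to its device address, so MPI_ISEND/MPI_IRECV can read or write GPU memory directly when the buffer is already mapped (mpp_group_update.fh maps it with target enter data). A reduced sketch of the send side (assumes a CUDA-aware MPI and an OpenMP-offload compiler such as nvfortran; names are illustrative):

    subroutine isend_from_device(buf, n, to_pe, tag, comm, req)
      use mpi
      implicit none
      integer, intent(in)  :: n, to_pe, tag, comm
      real,    intent(in)  :: buf(n)   ! must already be present on the device
      integer, intent(out) :: req
      integer :: ierr
      !$omp target data use_device_ptr(buf)
      ! within the region, buf is the device address
      call MPI_ISEND(buf, n, MPI_REAL, to_pe, tag, comm, req, ierr)
      !$omp end target data
    end subroutine isend_from_device
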