Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
407 changes: 337 additions & 70 deletions mpp/include/group_update_pack.inc

Large diffs are not rendered by default.

88 changes: 68 additions & 20 deletions mpp/include/group_update_unpack.inc
Original file line number Diff line number Diff line change
Expand Up @@ -18,92 +18,140 @@
!***********************************************************************

if( group%k_loop_inside ) then
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
! nvfortran + cray pointers imposes some restrictions on the loops below:
! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops
! must be ported rather than the whole outer loop.
! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop".
! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)".
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k )
!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx)
#endif
do n = nunpack, 1, -1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) if(use_device_ptr) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: field(is:ie,js:je,1:ksize))
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
field(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr)
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(3) default(shared) &
!$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) &
!$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr)
#endif
do k = 1, ksize
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
end do
pos = pos + ksize*nj*ni
end do
endif
enddo
else
!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
#ifndef __NVCOMPILER_OPENMP_GPU
!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) &
!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, &
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k)
!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx)
#endif
do nk = nunpack*ksize, 1, -1
n = (nk-1)/ksize + 1
k = mod((nk-1), ksize) + 1
buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos
pos = buffer_pos + (k-1)*group%unpack_size(n)
is = group%unpack_is(n); ie = group%unpack_ie(n)
js = group%unpack_js(n); je = group%unpack_je(n)
is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1
js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1
if( group%unpack_type(n) == FIELD_S ) then
do l=1,nscalar ! loop over number of fields
ptr_field = group%addrs_s(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
field(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
field(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_X ) then
do l=1,nvector ! loop over number of fields
ptr_fieldx = group%addrs_x(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldx(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldx(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
else if( group%unpack_type(n) == FIELD_Y ) then
do l=1,nvector ! loop over number of fields
ptr_fieldy = group%addrs_y(l)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target teams distribute parallel do collapse(2) default(shared) &
!$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) &
!$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr)
#endif
do j = js, je
do i = is, ie
pos = pos + 1
fieldy(i,j,k) = buffer(pos)
idx = pos + (j-js)*ni + (i-is) + 1
fieldy(i,j,k) = buffer(idx)
end do
end do
pos = pos + ni*nj
end do
endif
enddo
Expand Down
9 changes: 9 additions & 0 deletions mpp/include/mpp_comm_mpi.inc
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
!> @brief Initialize the @ref mpp_mod module. Must be called before any usage.
subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path )
!$ use omp_lib
integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG
integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize
integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages
Expand All @@ -54,6 +55,14 @@
call MPI_COMM_RANK( mpp_comm_private, pe, error )
call MPI_COMM_SIZE( mpp_comm_private, npes, error )

! set default device to enable multi GPU parallelism
! calls to both OpenACC and OpenMP runtimes are needed
! because we use both do-concurrent and openmp
! if you remove either, the code will run multiple
! ranks on a _single_ GPU. Be careful out there!
!$ call omp_set_default_device(pe)
!$acc set device_num(pe)

module_is_initialized = .TRUE.
if (present(test_level)) then
t_level = test_level
Expand Down
59 changes: 56 additions & 3 deletions mpp/include/mpp_group_update.fh
Original file line number Diff line number Diff line change
Expand Up @@ -430,7 +430,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
integer :: msgsize
integer :: from_pe, to_pe, buffer_pos, pos
integer :: ksize, is, ie, js, je
integer :: n, l, m, i, j, k, buffer_start_pos, nk
integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
integer :: shift, gridtype, midpoint
integer :: npack, nunpack, rotation, isd

Expand Down Expand Up @@ -476,6 +476,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)

!---pre-post receive.
call mpp_clock_begin(group_recv_clock)
#ifdef __NVCOMPILER_OPENMP_GPU
!$omp target enter data map(alloc: buffer) if(use_device_ptr)
#endif
do m = 1, nrecv
msgsize = group%recv_size(m)
from_pe = group%from_pe(m)
Expand All @@ -495,7 +498,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
call mpp_clock_begin(group_pack_clock)
!pack the data
buffer_start_pos = 0
! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_pack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_pack.inc>
#define __NVCOMPILER_OPENMP_GPU
#else
#include <group_update_pack.inc>
#endif
endif
call mpp_clock_end(group_pack_clock)

call mpp_clock_begin(group_send_clock)
Expand All @@ -518,7 +533,20 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type)
!---unpack the buffer
nunpack = group%nunpack
call mpp_clock_begin(group_unpk_clock)
! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_unpack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_unpack.inc>
#define __NVCOMPILER_OPENMP_GPU
!$omp target exit data map(release: buffer) if(use_device_ptr)
#else
#include <group_update_unpack.inc>
#endif
endif
call mpp_clock_end(group_unpk_clock)

! ---northern boundary fold
Expand Down Expand Up @@ -646,7 +674,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
integer :: msgsize, npack, rotation
integer :: from_pe, to_pe, buffer_pos, pos
integer :: ksize, is, ie, js, je
integer :: n, l, m, i, j, k, buffer_start_pos, nk
integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk
logical :: reuse_buf_pos
character(len=8) :: text

Expand Down Expand Up @@ -726,7 +754,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer)
call mpp_clock_begin(nonblock_group_pack_clock)
npack = group%npack
buffer_start_pos = group%buffer_start_pos
! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_pack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_pack.inc>
#define __NVCOMPILER_OPENMP_GPU
#else
#include <group_update_pack.inc>
#endif
endif
call mpp_clock_end(nonblock_group_pack_clock)

call mpp_clock_begin(nonblock_group_send_clock)
Expand All @@ -752,7 +792,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)
integer :: k, buffer_pos, pos, m, n, l
integer :: is, ie, js, je, ksize, i, j
integer :: shift, gridtype, midpoint, flags_v
integer :: nunpack, rotation, buffer_start_pos, nk, isd
integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd
logical :: recv_y(8)
MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:)))
MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s)
Expand Down Expand Up @@ -794,7 +834,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type)

call mpp_clock_begin(nonblock_group_unpk_clock)
buffer_start_pos = group%buffer_start_pos
! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with
! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined
if (use_device_ptr) then
#include <group_update_unpack.inc>
else
#ifdef __NVCOMPILER_OPENMP_GPU
#undef __NVCOMPILER_OPENMP_GPU
#include <group_update_unpack.inc>
#define __NVCOMPILER_OPENMP_GPU
!$omp target exit data map(release: buffer) if(use_device_ptr)
#else
#include <group_update_unpack.inc>
#endif
endif
call mpp_clock_end(nonblock_group_unpk_clock)

! ---northern boundary fold
Expand Down
9 changes: 6 additions & 3 deletions mpp/include/mpp_transmit.inc
Original file line number Diff line number Diff line change
Expand Up @@ -189,7 +189,8 @@

ptr = LOC(get_data)
get_len=1; if(PRESENT(glen))get_len=glen
call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request )
call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, &
block, tag, recv_request=request, omp_offload=omp_offload )

end subroutine MPP_RECV_SCALAR_

Expand All @@ -207,7 +208,8 @@
pointer( ptr, put_data1D )
ptr = LOC(put_data)
put_len=1; if(PRESENT(plen))put_len=plen
call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request )
call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, &
tag=tag, send_request=request, omp_offload=omp_offload )

end subroutine MPP_SEND_SCALAR_

Expand All @@ -220,7 +222,8 @@
integer, intent(out), optional :: request

MPP_TYPE_ :: dummy(1,1)
call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request )
call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, &
block, tag, recv_request=request )
end subroutine MPP_RECV_2D_

subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request )
Expand Down
26 changes: 22 additions & 4 deletions mpp/include/mpp_transmit_mpi.fh
Original file line number Diff line number Diff line change
Expand Up @@ -49,10 +49,14 @@
integer :: i
integer :: comm_tag
integer :: rsize
logical :: use_device_ptr

if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' )
if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return

use_device_ptr = .false.
if (present(omp_offload)) use_device_ptr = omp_offload

block_comm = .true.
if(PRESENT(block)) block_comm = block

Expand Down Expand Up @@ -82,8 +86,15 @@
if( cur_send_request > max_request ) &
call mpp_error(FATAL, &
& "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply")
call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
request_send(cur_send_request), error)
if (use_device_ptr) then
!$omp target data use_device_addr(put_data)
call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
request_send(cur_send_request), error)
!$omp end target data
else
call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, &
request_send(cur_send_request), error)
endif
endif
if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_)
else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe
Expand Down Expand Up @@ -130,8 +141,15 @@
if( cur_recv_request > max_request ) &
call mpp_error(FATAL, &
"MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply")
call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
request_recv(cur_recv_request), error )
if (use_device_ptr) then
!$omp target data use_device_addr(get_data)
call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
request_recv(cur_recv_request), error )
!$omp end target data
else
call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, &
request_recv(cur_recv_request), error )
endif
size_recv(cur_recv_request) = get_len
type_recv(cur_recv_request) = MPI_TYPE_
endif
Expand Down
Loading
Loading