From 7d77de70a11f9785d19315897c39a151cd80e517 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 28 Aug 2025 17:55:26 +1000 Subject: [PATCH 01/28] add gpu2gpu mpi transer with flag for do_group_update --- mpp/include/group_update_pack.inc | 27 +++++++++++++++------------ mpp/include/group_update_unpack.inc | 28 ++++++++++++++++------------ mpp/include/mpp_group_update.fh | 21 +++++++++++++++------ mpp/include/mpp_transmit.inc | 10 ++++++---- mpp/include/mpp_transmit_mpi.fh | 29 ++++++++++++++++++++++++----- 5 files changed, 76 insertions(+), 39 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 142106aece..b1a55b68e3 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,9 +18,6 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -32,14 +29,17 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do enddo + pos = pos + ksize*(je-js+1)*(ie-is+1) enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields @@ -83,14 +83,17 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then @@ -162,14 +165,17 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields @@ -239,9 +245,6 @@ if( group%k_loop_inside ) then endif enddo else -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 7f60ed93df..c18275e9bb 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,9 +18,6 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k ) do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -29,45 +26,52 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) + !$omp target teams distribute parallel do if(use_device_ptr) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: field(is:ie,js:je,1:ksize)) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) + !$omp target teams distribute parallel do private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*(je-js+1)*(ie-is+1) end do endif enddo else -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) do nk = nunpack*ksize, 1, -1 n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 0f04c06c3b..8464fa55f5 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -419,20 +419,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags, end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_ -subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) +subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) type(mpp_group_update_type), intent(inout) :: group type(domain2D), intent(inout) :: domain MPP_TYPE_, intent(in) :: d_type + logical, optional, intent(in) :: omp_offload integer :: nscalar, nvector, nlist logical :: recv_y(8) integer :: nsend, nrecv, flags_v integer :: msgsize - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je integer :: n, l, m, i, j, k, buffer_start_pos, nk integer :: shift, gridtype, midpoint integer :: npack, nunpack, rotation, isd + logical :: use_device_ptr MPP_TYPE_ :: buffer(mpp_domains_stack_size) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) @@ -448,6 +450,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) nlist = size(domain%list(:)) gridtype = group%gridtype + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + !--- ksize_s must equal ksize_v if(nvector > 0 .AND. nscalar > 0) then if(group%ksize_s .NE. group%ksize_v) then @@ -476,13 +481,14 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) !---pre-post receive. call mpp_clock_begin(group_recv_clock) + !$omp target enter data map(alloc: buffer) if(use_device_ptr) do m = 1, nrecv msgsize = group%recv_size(m) from_pe = group%from_pe(m) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_recv(m) call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., & - tag=COMM_TAG_1) + tag=COMM_TAG_1, omp_offload=omp_offload) end if end do @@ -504,7 +510,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_send(n) to_pe = group%to_pe(n) - call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1) + call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload) endif enddo call mpp_clock_end(group_send_clock) @@ -519,6 +525,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) #include + !$omp target exit data map(release: buffer) if(use_device_ptr) call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -644,10 +651,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: nscalar, nvector integer :: nsend, nrecv, flags_v integer :: msgsize, npack, rotation - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je integer :: n, l, m, i, j, k, buffer_start_pos, nk logical :: reuse_buf_pos + logical, parameter :: use_device_ptr = .false. ! placeholder character(len=8) :: text MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) @@ -749,11 +757,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) MPP_TYPE_, intent(in) :: d_type integer :: nsend, nrecv, nscalar, nvector - integer :: k, buffer_pos, pos, m, n, l + integer :: k, buffer_pos, pos, m, n, l, idx integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v integer :: nunpack, rotation, buffer_start_pos, nk, isd logical :: recv_y(8) + logical, parameter :: use_device_ptr = .false. ! placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v) diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index 24d0cc437f..560de01c18 100644 --- a/mpp/include/mpp_transmit.inc +++ b/mpp/include/mpp_transmit.inc @@ -171,7 +171,7 @@ call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request ) end subroutine MPP_SEND_ - subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request ) + subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload ) !a mpp_transmit with null arguments on the put side integer, intent(in) :: from_pe MPP_TYPE_, intent(out) :: get_data @@ -180,6 +180,7 @@ integer, intent(out), optional :: request integer, optional, intent(in) :: glen + logical, optional, intent(in) :: omp_offload integer :: get_len MPP_TYPE_ :: get_data1D(1) MPP_TYPE_ :: dummy(1) @@ -189,17 +190,18 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ - subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request) + subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload) !a mpp_transmit with null arguments on the get side integer, intent(in) :: to_pe MPP_TYPE_, intent(in) :: put_data integer, optional, intent(in) :: plen integer, intent(in), optional :: tag integer, intent(out), optional :: request + logical, optional, intent(in) :: omp_offload integer :: put_len MPP_TYPE_ :: put_data1D(1) MPP_TYPE_ :: dummy(1) @@ -207,7 +209,7 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 023c2d5124..2d2fef19e6 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -37,7 +37,7 @@ !!(avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -45,10 +45,15 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload logical :: block_comm integer :: i integer :: comm_tag integer :: rsize + logical :: use_device_ptr + + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return @@ -82,8 +87,15 @@ if( cur_send_request > max_request ) & call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & - request_send(cur_send_request), error) + if (use_device_ptr) then + !$omp target data use_device_ptr(put_data) + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + !$omp end target data + else + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + endif endif if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_) else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe @@ -130,8 +142,15 @@ if( cur_recv_request > max_request ) & call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & - request_recv(cur_recv_request), error ) + if (use_device_ptr) then + !$omp target data use_device_ptr(get_data) + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + !$omp end target data + else + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + endif size_recv(cur_recv_request) = get_len type_recv(cur_recv_request) = MPI_TYPE_ endif From 5a85ab4809d2a2f83f74b029ecf6f0565a58266b Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 2 Sep 2025 10:12:37 +1000 Subject: [PATCH 02/28] add missing collapse(3) clauses --- mpp/include/group_update_pack.inc | 6 +++--- mpp/include/group_update_unpack.inc | 12 +++++++----- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index b1a55b68e3..d08335d1e6 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -29,7 +29,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je @@ -83,7 +83,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je @@ -165,7 +165,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) - !$omp target teams distribute parallel do private(idx) & + !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) do k = 1, ksize do j = js, je diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index c18275e9bb..f242cd3452 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -26,7 +26,7 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) - !$omp target teams distribute parallel do if(use_device_ptr) private(idx) & + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & !$omp map(from: field(is:ie,js:je,1:ksize)) do k = 1, ksize @@ -42,8 +42,9 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) - !$omp target teams distribute parallel do private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie @@ -57,8 +58,9 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) - !$omp target teams distribute parallel do private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) do k = 1, ksize do j = js, je do i = is, ie From aa8f22b6610679440309f07ad716f53a7b24ef20 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 13:14:15 +1100 Subject: [PATCH 03/28] Use __NVCOMPILER macro for target regions --- mpp/include/group_update_pack.inc | 28 +++++++++++++++++----------- mpp/include/group_update_unpack.inc | 28 +++++++++++++++++----------- mpp/include/mpp_group_update.fh | 10 +++++++--- 3 files changed, 41 insertions(+), 25 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index d08335d1e6..43e2af5bf5 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -21,25 +21,27 @@ if( group%k_loop_inside ) then do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) + is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1 + js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1 rotation = group%pack_rotation(n) if( group%pack_type(n) == FIELD_S ) then select case( rotation ) case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = field(i,j,k) end do end do enddo - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields @@ -83,17 +85,19 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = fieldx(i,j,k) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then @@ -165,17 +169,19 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 buffer(idx) = fieldy(i,j,k) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index f242cd3452..3f0b08faeb 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -21,55 +21,61 @@ if( group%k_loop_inside ) then do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 field(i,j,k) = buffer(idx) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 fieldx(i,j,k) = buffer(idx) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: buffer(pos+1:pos+ksize*(je-js+1)*(ie-is+1))) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - idx = pos + (k-1)*(je-js+1)*(ie-is+1) + (j-js)*(ie-is+1) + (i-is) + 1 + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 fieldy(i,j,k) = buffer(idx) end do end do end do - pos = pos + ksize*(je-js+1)*(ie-is+1) + pos = pos + ksize*nj*ni end do endif enddo diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 8464fa55f5..8f9251b10f 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -431,7 +431,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) integer :: msgsize integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk integer :: shift, gridtype, midpoint integer :: npack, nunpack, rotation, isd logical :: use_device_ptr @@ -481,7 +481,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---pre-post receive. call mpp_clock_begin(group_recv_clock) +#ifdef __NVCOMPILER !$omp target enter data map(alloc: buffer) if(use_device_ptr) +#endif do m = 1, nrecv msgsize = group%recv_size(m) from_pe = group%from_pe(m) @@ -525,7 +527,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) #include +#ifdef __NVCOMPILER !$omp target exit data map(release: buffer) if(use_device_ptr) +#endif call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -653,7 +657,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: msgsize, npack, rotation integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk logical :: reuse_buf_pos logical, parameter :: use_device_ptr = .false. ! placeholder character(len=8) :: text @@ -760,7 +764,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) integer :: k, buffer_pos, pos, m, n, l, idx integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v - integer :: nunpack, rotation, buffer_start_pos, nk, isd + integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd logical :: recv_y(8) logical, parameter :: use_device_ptr = .false. ! placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) From 1e8274712f7c26c4a3df1b07865ffd9d89cab1a0 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 13:19:55 +1100 Subject: [PATCH 04/28] add back old omp directive wrapped in #ifndef __NVCOMPILER --- mpp/include/group_update_pack.inc | 10 ++++++++++ mpp/include/group_update_unpack.inc | 14 ++++++++++++-- 2 files changed, 22 insertions(+), 2 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 43e2af5bf5..6fa8866e91 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,6 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -251,6 +256,11 @@ if( group%k_loop_inside ) then endif enddo else +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 3f0b08faeb..894d6b110c 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,6 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos @@ -80,13 +85,18 @@ if( group%k_loop_inside ) then endif enddo else +#ifndef __NVCOMPILER +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = nunpack*ksize, 1, -1 n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos + (k-1)*group%unpack_size(n) - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) From dc36e574ef3069e752b9e50050539e0feb504d26 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 10 Oct 2025 15:46:53 +1100 Subject: [PATCH 05/28] port remaining un/pack loops --- mpp/include/group_update_pack.inc | 263 ++++++++++++++++++++++------ mpp/include/group_update_unpack.inc | 27 ++- 2 files changed, 226 insertions(+), 64 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 6fa8866e91..27ed8b9438 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -51,38 +51,53 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -108,64 +123,91 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + ! pos = pos + 1 + ! buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -191,65 +233,90 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -274,42 +341,62 @@ else case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -317,65 +404,95 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) @@ -384,65 +501,95 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 894d6b110c..3fa214c7bc 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -100,32 +100,47 @@ else if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER__ + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER + !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do endif enddo From 972a97c77cf46f7794345d6982c23c7ef05f823c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jorge=20Luis=20G=C3=A1lvez=20Vallejo?= Date: Thu, 30 Oct 2025 16:51:55 +1100 Subject: [PATCH 06/28] add multi gpu support (#2) * add multi gpu support * address review comments, add helpful comment for the acc/mp runbtime call --- mpp/include/mpp_comm_mpi.inc | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index 928f9fcb92..7b20d43044 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -31,6 +31,7 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !> @brief Initialize the @ref mpp_mod module. Must be called before any usage. subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path ) + !$use omp_lib integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages @@ -54,6 +55,14 @@ call MPI_COMM_RANK( mpp_comm_private, pe, error ) call MPI_COMM_SIZE( mpp_comm_private, npes, error ) + ! set default device to enable multi GPU parallelism + ! calls to both OpenACC and OpenMP runtimes are needed + ! because we use both do-concurrent and openmp + ! if you remove either, the code will run multiple + ! ranks on a _single_ GPU. Be careful out there! + !$call omp_set_default_device(pe) + !$acc set device_num(pe) + module_is_initialized = .TRUE. if (present(test_level)) then t_level = test_level From 78b436d9792f62893666023c8d6fa03b878d5cd1 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 14 Oct 2025 13:20:54 +1100 Subject: [PATCH 07/28] sub __NVCOMPILER with __NVCOMPILER_OPENMP_GPU --- mpp/include/group_update_pack.inc | 68 ++++++++++++++--------------- mpp/include/group_update_unpack.inc | 16 +++---- mpp/include/mpp_group_update.fh | 4 +- 3 files changed, 44 insertions(+), 44 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 27ed8b9438..4c5b7be7e3 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,7 +18,7 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -34,7 +34,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -51,7 +51,7 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -68,7 +68,7 @@ if( group%k_loop_inside ) then case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -85,7 +85,7 @@ if( group%k_loop_inside ) then case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -105,7 +105,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -123,7 +123,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -140,7 +140,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -158,7 +158,7 @@ if( group%k_loop_inside ) then case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -178,7 +178,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -195,7 +195,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -216,7 +216,7 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -233,7 +233,7 @@ if( group%k_loop_inside ) then case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -251,7 +251,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -268,7 +268,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -287,7 +287,7 @@ if( group%k_loop_inside ) then if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -304,7 +304,7 @@ if( group%k_loop_inside ) then else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) #endif @@ -323,7 +323,7 @@ if( group%k_loop_inside ) then endif enddo else -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -341,7 +341,7 @@ else case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -356,7 +356,7 @@ else case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -371,7 +371,7 @@ else case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -386,7 +386,7 @@ else case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -404,7 +404,7 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -420,7 +420,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -435,7 +435,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -451,7 +451,7 @@ else case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -467,7 +467,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -482,7 +482,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -501,7 +501,7 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -516,7 +516,7 @@ else case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -532,7 +532,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -547,7 +547,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -564,7 +564,7 @@ else if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif @@ -579,7 +579,7 @@ else else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) #endif diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 3fa214c7bc..286f07bc83 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,7 +18,7 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) @@ -31,7 +31,7 @@ if( group%k_loop_inside ) then if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) @@ -49,7 +49,7 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) @@ -67,7 +67,7 @@ if( group%k_loop_inside ) then else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) @@ -85,7 +85,7 @@ if( group%k_loop_inside ) then endif enddo else -#ifndef __NVCOMPILER +#ifndef __NVCOMPILER_OPENMP_GPU !$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) @@ -100,7 +100,7 @@ else if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) #endif @@ -115,7 +115,7 @@ else else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) -#ifdef __NVCOMPILER__ +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) #endif @@ -130,7 +130,7 @@ else else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) #endif diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 8f9251b10f..4f4c668b7d 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -481,7 +481,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---pre-post receive. call mpp_clock_begin(group_recv_clock) -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target enter data map(alloc: buffer) if(use_device_ptr) #endif do m = 1, nrecv @@ -527,7 +527,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) #include -#ifdef __NVCOMPILER +#ifdef __NVCOMPILER_OPENMP_GPU !$omp target exit data map(release: buffer) if(use_device_ptr) #endif call mpp_clock_end(group_unpk_clock) From 5ac547f974ea671ff938dd59ec632c0409c40007 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 16 Dec 2025 16:42:46 +1100 Subject: [PATCH 08/28] allow choice of gpu or cpu parallel To enable this, had to be removed - otherwise segfaults happen on the GPU. --- mpp/include/group_update_pack.inc | 9 ++++----- mpp/include/group_update_unpack.inc | 9 ++++----- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 4c5b7be7e3..8c421ba906 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,11 +18,10 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) -#endif +!$OMP parallel do shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) & +!$OMP if (.not.use_device_ptr) do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 286f07bc83..a745982cf0 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,11 +18,10 @@ !*********************************************************************** if( group%k_loop_inside ) then -#ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) -#endif +!$OMP parallel do shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) & +!$OMP if (.not.use_device_ptr) do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos From 85ae7efbc81e6c4053e9c519e82a1ccc1032a7df Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Thu, 15 Jan 2026 02:31:34 +0000 Subject: [PATCH 09/28] fix omp set device call --- mpp/include/mpp_comm_mpi.inc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index 7b20d43044..42d30f308f 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -60,7 +60,7 @@ ! because we use both do-concurrent and openmp ! if you remove either, the code will run multiple ! ranks on a _single_ GPU. Be careful out there! - !$call omp_set_default_device(pe) + !$ call omp_set_default_device(pe) !$acc set device_num(pe) module_is_initialized = .TRUE. From b4993f5dcce9bff06ee3aae21921e30e044f0d33 Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Tue, 17 Feb 2026 09:54:24 +1100 Subject: [PATCH 10/28] Revert "allow choice of gpu or cpu parallel" This reverts commit 0cc2a77ee70851ad08458a52a1e4f6307e163463. Having both the CPU and GPU OpenMP directives compiled caused a significant slowdown in GPU packing/unpacking performance - even if parallelism is controlled using OpenMP "if" clause. --- mpp/include/group_update_pack.inc | 9 +++++---- mpp/include/group_update_unpack.inc | 9 +++++---- 2 files changed, 10 insertions(+), 8 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 8c421ba906..4c5b7be7e3 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,10 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) & -!$OMP if (.not.use_device_ptr) +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index a745982cf0..286f07bc83 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,10 +18,11 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & -!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) & -!$OMP if (.not.use_device_ptr) +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos From baa8d88cc0df5d5e2ca20dca1d3d507d96c4196d Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Mon, 16 Mar 2026 19:09:09 +0000 Subject: [PATCH 11/28] OMP MPI: Minor cleanups Some very minor changes to the OpenMP target MPI PR: * use_device_ptr -> use_device_addr This appears to be the updated form (or at least nvfortran says it is) * Whitespace added to `!$ use omp_lib` Does not seem crucial but from our previous discussion it appears more correct. * Removal of some trailing whitespace. --- mpp/include/mpp_comm_mpi.inc | 10 +++++----- mpp/include/mpp_transmit_mpi.fh | 4 ++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index 42d30f308f..142473497e 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -31,7 +31,7 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !> @brief Initialize the @ref mpp_mod module. Must be called before any usage. subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path ) - !$use omp_lib + !$ use omp_lib integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages @@ -55,10 +55,10 @@ call MPI_COMM_RANK( mpp_comm_private, pe, error ) call MPI_COMM_SIZE( mpp_comm_private, npes, error ) - ! set default device to enable multi GPU parallelism - ! calls to both OpenACC and OpenMP runtimes are needed - ! because we use both do-concurrent and openmp - ! if you remove either, the code will run multiple + ! set default device to enable multi GPU parallelism + ! calls to both OpenACC and OpenMP runtimes are needed + ! because we use both do-concurrent and openmp + ! if you remove either, the code will run multiple ! ranks on a _single_ GPU. Be careful out there! !$ call omp_set_default_device(pe) !$acc set device_num(pe) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 2d2fef19e6..ce70b19812 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -88,7 +88,7 @@ call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") if (use_device_ptr) then - !$omp target data use_device_ptr(put_data) + !$omp target data use_device_addr(put_data) call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & request_send(cur_send_request), error) !$omp end target data @@ -143,7 +143,7 @@ call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") if (use_device_ptr) then - !$omp target data use_device_ptr(get_data) + !$omp target data use_device_addr(get_data) call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & request_recv(cur_recv_request), error ) !$omp end target data From 8d83224b1feb87d5160a80b07b9926f36e225f76 Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Mon, 16 Mar 2026 19:47:25 +0000 Subject: [PATCH 12/28] OMP target MPI: line length compliance This patch refactors several lines to keep within the 121-character line length limit prescribed by the FMS style guidelines. --- mpp/include/group_update_pack.inc | 128 ++++++++++++++++++++++-------- mpp/include/mpp_transmit.inc | 9 ++- 2 files changed, 102 insertions(+), 35 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 4c5b7be7e3..2422c1d8fa 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -36,7 +36,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -53,7 +55,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -70,7 +74,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -87,7 +93,9 @@ if( group%k_loop_inside ) then ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: field(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -107,7 +115,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -125,7 +135,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -142,7 +154,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -160,7 +174,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -180,7 +196,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -197,7 +215,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -218,7 +238,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = js, je @@ -235,7 +257,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = is, ie @@ -253,7 +277,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -270,7 +296,9 @@ if( group%k_loop_inside ) then ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldx(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do i = ie, is, -1 @@ -289,7 +317,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -306,7 +336,9 @@ if( group%k_loop_inside ) then ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(3) private(idx) & - !$omp map(to: fieldy(is:ie,js:je,1:ksize)) map(from: buffer(pos+1:pos+ksize*nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) #endif do k = 1, ksize do j = je, js, -1 @@ -343,7 +375,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -358,7 +392,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -373,7 +409,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -388,7 +426,9 @@ else ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: field(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -406,7 +446,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -422,7 +464,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -437,7 +481,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -453,7 +499,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -469,7 +517,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -484,7 +534,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -503,7 +555,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = js, je do i = is, ie @@ -518,7 +572,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if(use_device_ptr) #endif do i = is, ie do j = je, js, -1 @@ -534,7 +590,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -549,7 +607,9 @@ else ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldx(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do i = ie, is, -1 do j = js, je @@ -566,7 +626,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 @@ -581,7 +643,9 @@ else ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU !$omp target teams distribute parallel do collapse(2) private(idx) & - !$omp map(to: fieldy(k, is:ie, js:je)) map(from: buffer(pos+1:pos+nj*ni)) if(use_device_ptr) + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) #endif do j = je, js, -1 do i = ie, is, -1 diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index 560de01c18..71b0d97f0c 100644 --- a/mpp/include/mpp_transmit.inc +++ b/mpp/include/mpp_transmit.inc @@ -190,7 +190,8 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request, omp_offload=omp_offload ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, & + block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ @@ -209,7 +210,8 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request, omp_offload=omp_offload ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, & + tag=tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ @@ -222,7 +224,8 @@ integer, intent(out), optional :: request MPP_TYPE_ :: dummy(1,1) - call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, & + block, tag, recv_request=request ) end subroutine MPP_RECV_2D_ subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request ) From 236aef32a060abce0acff34048a94de39abcf244 Mon Sep 17 00:00:00 2001 From: Marshall Ward Date: Tue, 17 Mar 2026 14:14:53 +0000 Subject: [PATCH 13/28] OMP MPI: Update nocomm interface The no-comm (no MPI) interface has been updated to support the new omp_offload argument. --- mpp/include/mpp_transmit_mpi.fh | 6 +++--- mpp/include/mpp_transmit_nocomm.fh | 5 ++++- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index ce70b19812..4ce6af6148 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -52,12 +52,12 @@ integer :: rsize logical :: use_device_ptr - use_device_ptr = .false. - if (present(omp_offload)) use_device_ptr = omp_offload - if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + block_comm = .true. if(PRESENT(block)) block_comm = block diff --git a/mpp/include/mpp_transmit_nocomm.fh b/mpp/include/mpp_transmit_nocomm.fh index ca132a4dc8..ee08d53a53 100644 --- a/mpp/include/mpp_transmit_nocomm.fh +++ b/mpp/include/mpp_transmit_nocomm.fh @@ -36,7 +36,7 @@ !!words from an array of any rank to be passed (avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -44,6 +44,8 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload + ! NOTE: omp_offload is unused in this function integer :: i, outunit MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI) @@ -55,6 +57,7 @@ if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + outunit = stdout() if( debug )then call SYSTEM_CLOCK(tick) From e3116ef234591c1c6772b96e68d86ada970c64ed Mon Sep 17 00:00:00 2001 From: Edward Yang Date: Fri, 17 Apr 2026 13:48:32 +1000 Subject: [PATCH 14/28] use openmp cpu if ompoffload=.false. This ensures that (un)packing steps in do_group_update is performed with openmp cpu parallelism if ompoffload=.false.. Previously it would only do serial. This is implemented by undefining the GPU macro (currently __NVCOMPILER_OPENMP_GPU) and re-including the (un)packing files. To make this work, the default(shared) was used in all the relevant OpenMP directives. If default(none) is used, the loops would hang or segfault. --- mpp/include/group_update_pack.inc | 105 +++++++++++++++++++--------- mpp/include/group_update_unpack.inc | 27 ++++--- mpp/include/mpp_group_update.fh | 47 +++++++++++++ 3 files changed, 137 insertions(+), 42 deletions(-) diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 2422c1d8fa..a6ee72d7b8 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,8 +18,13 @@ !*********************************************************************** if( group%k_loop_inside ) then +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -35,7 +40,8 @@ if( group%k_loop_inside ) then do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -54,7 +60,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -73,7 +80,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -92,7 +100,8 @@ if( group%k_loop_inside ) then do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: field(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -114,7 +123,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -134,7 +144,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -153,7 +164,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -173,7 +185,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -195,7 +208,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if(use_device_ptr) @@ -214,7 +228,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -237,7 +252,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -256,7 +272,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -276,7 +293,8 @@ if( group%k_loop_inside ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -295,7 +313,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -316,7 +335,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if (use_device_ptr) @@ -335,7 +355,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & !$omp if(use_device_ptr) @@ -356,7 +377,7 @@ if( group%k_loop_inside ) then enddo else #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -374,7 +395,8 @@ else do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -391,7 +413,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -408,7 +431,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -425,7 +449,8 @@ else do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: field(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -445,7 +470,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -463,7 +489,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -480,7 +507,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -498,7 +526,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -516,7 +545,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -533,7 +563,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -554,7 +585,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -571,7 +603,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if(use_device_ptr) @@ -589,7 +622,8 @@ else do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -606,7 +640,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: fieldx(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -625,7 +660,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) @@ -642,7 +678,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: fieldy(k, is:ie, js:je)) & !$omp map(from: buffer(pos+1:pos+nj*ni)) & !$omp if (use_device_ptr) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 286f07bc83..25a31ee8fc 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,8 +18,13 @@ !*********************************************************************** if( group%k_loop_inside ) then +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) #endif @@ -32,7 +37,8 @@ if( group%k_loop_inside ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) private(idx) & + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: field(is:ie,js:je,1:ksize)) #endif @@ -50,7 +56,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) #endif @@ -68,7 +75,8 @@ if( group%k_loop_inside ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(3) private(idx) & + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) & !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) #endif @@ -86,7 +94,7 @@ if( group%k_loop_inside ) then enddo else #ifndef __NVCOMPILER_OPENMP_GPU -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & !$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) #endif @@ -101,7 +109,8 @@ else do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je @@ -116,7 +125,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je @@ -131,7 +141,8 @@ else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) #ifdef __NVCOMPILER_OPENMP_GPU - !$omp target teams distribute parallel do collapse(2) private(idx) & + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) #endif do j = js, je diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 4f4c668b7d..b8aaadabe3 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -503,7 +503,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) call mpp_clock_begin(group_pack_clock) !pack the data buffer_start_pos = 0 + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else +#include +#endif + endif call mpp_clock_end(group_pack_clock) call mpp_clock_begin(group_send_clock) @@ -526,10 +538,20 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---unpack the buffer nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else #ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include #endif + endif call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -738,7 +760,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) call mpp_clock_begin(nonblock_group_pack_clock) npack = group%npack buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_pack_clock) call mpp_clock_begin(nonblock_group_send_clock) @@ -807,7 +841,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(nonblock_group_unpk_clock) buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_unpk_clock) ! ---northern boundary fold From 41521155b8177650acbe605ef3f4d2e192e94e75 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 6 May 2026 12:33:16 -0600 Subject: [PATCH 15/28] Linting clean-up Removed trailing whitespace, replaced tabs with spaces, and kept all lines at 121 lines or fewer --- constants/cesm_constants.fh | 26 ++++--- diag_manager/diag_manager.F90 | 19 +++-- diag_manager/diag_util.F90 | 8 +- diag_manager/mppnccombine.c | 128 ++++++++++++++++---------------- fms2_io/netcdf_io.F90 | 4 +- mpp/include/mpp_group_update.fh | 8 +- 6 files changed, 101 insertions(+), 92 deletions(-) diff --git a/constants/cesm_constants.fh b/constants/cesm_constants.fh index db2ba4bb6c..eee816ce88 100644 --- a/constants/cesm_constants.fh +++ b/constants/cesm_constants.fh @@ -27,7 +27,7 @@ character(len=18), public, parameter :: constants_version = 'FMSConstants: CESM' ! Constants below use CESM shr values real(R8), public, parameter :: KAPPA = RDGAS/CP_AIR !< RDGAS / CP_AIR [dimensionless] -real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water [m^3/kg] +real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water [m^3/kg] real(R8), public, parameter :: RHO_CP = RHO0*CP_OCEAN !< (kg/m^3)*(cal/kg/deg C)(joules/cal) = !<(joules/m^3/deg C) [J/m^3/deg] real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of the @@ -49,15 +49,21 @@ real(R8), public, parameter :: SECONDS_PER_HOUR = 3600._r8 !< Seconds in real(R8), public, parameter :: SECONDS_PER_MINUTE = 60._r8 !< Seconds in a minute [s] real(R8), public, parameter :: RAD_TO_DEG = 180._r8/PI !< Degrees per radian [deg/rad] real(R8), public, parameter :: DEG_TO_RAD = PI/180._r8 !< Radians per degree [rad/deg] -real(R8), public, parameter :: RADIAN = RAD_TO_DEG !< Equal to RAD_TO_DEG for backward compatability. [rad/deg] -real(R8), public, parameter :: ALOGMIN = -50.0_r8 !< Minimum value allowed as argument to log function [N/A] -real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero exceptions [N/A] -real(R8), public, parameter :: RADCON = ((1.0E+02_r8*GRAV)/(1.0E+04_r8*CP_AIR))*SECONDS_PER_DAY !< convert flux divergence - !to heating rate in degrees per day [deg sec/(cm day)] +real(R8), public, parameter :: RADIAN = RAD_TO_DEG !< Equal to RAD_TO_DEG for backward compatability. + !! [rad/deg] +real(R8), public, parameter :: ALOGMIN = -50.0_r8 !< Minimum value allowed as argument to log function + !! [N/A] +real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero exceptions + !! [N/A] +real(R8), public, parameter :: RADCON = ((1.0E+02_r8*GRAV)/(1.0E+04_r8*CP_AIR))*SECONDS_PER_DAY !< convert flux + !! divergence to heating rate in degrees per day + !! [deg sec/(cm day)] real(R8), public, parameter :: RADCON_MKS = (GRAV/CP_AIR)*SECONDS_PER_DAY !< Factor used to convert flux divergence to - !< heating rate in degrees per day [deg sec/(m day)] -real(R8), public, parameter :: O2MIXRAT = 2.0953E-01_r8 !< Mixing ratio of molecular oxygen in air [dimensionless] -real(R8), public, parameter :: C2DBARS = 1.e-4_r8 !< rho*g*z(mks) to dbars: 1dbar = 10^4 (kg/m^3)(m/s^2)m [dbars] + !! heating rate in degrees per day [deg sec/(m day)] +real(R8), public, parameter :: O2MIXRAT = 2.0953E-01_r8 !< Mixing ratio of molecular oxygen in air + !! [dimensionless] +real(R8), public, parameter :: C2DBARS = 1.e-4_r8 !< rho*g*z(mks) to dbars: 1dbar = 10^4 + !! (kg/m^3)(m/s^2)m [dbars] real(R8), public, parameter :: KELVIN = 273.15_r8 !< Degrees Kelvin at zero Celsius [K] #ifdef SMALL_EARTH @@ -70,4 +76,4 @@ real(R8), public, parameter :: KELVIN = 273.15_r8 !< Degrees Ke #endif #else real, public, parameter :: small_fac = 1._r8 -#endif \ No newline at end of file +#endif diff --git a/diag_manager/diag_manager.F90 b/diag_manager/diag_manager.F90 index d0d6e4a3ad..335e805d7b 100644 --- a/diag_manager/diag_manager.F90 +++ b/diag_manager/diag_manager.F90 @@ -239,7 +239,8 @@ MODULE diag_manager_mod USE fms_diag_outfield_mod, ONLY: fmsDiagOutfieldIndex_type, fmsDiagOutfield_type USE fms_diag_fieldbuff_update_mod, ONLY: fieldbuff_update, fieldbuff_copy_missvals, & & fieldbuff_copy_fieldvals - use netcdf_io_mod, ONLY: filepath_list_type, partitioned_global_files, partitioned_section_files, append_to_filepath_list + use netcdf_io_mod, ONLY: filepath_list_type, partitioned_global_files, partitioned_section_files, & + & append_to_filepath_list #ifdef use_netCDF USE netcdf, ONLY: NF90_INT, NF90_FLOAT, NF90_CHAR @@ -376,7 +377,7 @@ MODULE diag_manager_mod END INTERFACE diag_field_add_attribute ! ----- interface to the C function ----- -interface +interface function exec_mppnccombine(outfile) bind(C) use, intrinsic :: iso_c_binding, only: c_int, c_char implicit none @@ -3733,7 +3734,7 @@ subroutine combine_files() integer(c_int) :: smallest_pix ! The smallest IO PE index of the set of IO PEs writing the current section file. integer(c_int) :: ireturn ! Return code from mppnccombine integer(c_int) :: niopes ! Number of IO PEs participating in writing of global files - integer :: f ! File index for the global diagnostic files + integer :: f ! File index for the global diagnostic files integer :: pix_order ! 0-based order of the IO PE in the list of all IO PEs writing the file. type(filepath_list_type), pointer :: current type(filepath_list_type), pointer :: files_to_combine ! list of files to combined by this PE @@ -3754,8 +3755,8 @@ subroutine combine_files() filepath = trim(adjustl(current%path)) outfile = filepath(1:len(filepath)-5) // c_null_char - ! get the number of files to combine (for the first global file only). The number of files is the - ! same for all global files. Similarly, get pix and pix_order for the first file only, since for + ! get the number of files to combine (for the first global file only). The number of files is the + ! same for all global files. Similarly, get pix and pix_order for the first file only, since for ! all global files, the pix and pix_order are the same. if (niopes == 0) then niopes = num_partitioned_files(outfile) @@ -3806,7 +3807,9 @@ subroutine combine_files() do while (associated(current)) !write(stdout_unit,*) ' Combining file ' // filepath(1:len(filepath)-5) ireturn = exec_mppnccombine(current%path) - if (ireturn /= 0) call error_mesg('diag_manager_mod::combine_files', 'mppnccombine failed for file ' // trim(current%path), FATAL) + if (ireturn /= 0) & + call error_mesg('diag_manager_mod::combine_files', 'mppnccombine failed for file ' // & + trim(current%path), FATAL) current => current%next end do end if @@ -3825,10 +3828,10 @@ function get_pix_order(filename, num_files, pix) result(pix_order) ! local integer :: pix_order ! 0-based order of the pix in the list of all IO PEs writing the file character(len=4) :: suffix ! 0000, 0001, etc. - integer :: npes ! total number of all PEs + integer :: npes ! total number of all PEs integer :: i, f logical :: exists - + npes = mpp_npes() pix_order = -1 diff --git a/diag_manager/diag_util.F90 b/diag_manager/diag_util.F90 index e42c2d7639..1e3c6f7557 100644 --- a/diag_manager/diag_util.F90 +++ b/diag_manager/diag_util.F90 @@ -1726,8 +1726,8 @@ SUBROUTINE opening_file(file, time, filename_time) INTEGER, ALLOCATABLE :: axesc(:) ! indices if compressed axes associated with the field LOGICAL :: time_ops, aux_present, match_aux_name, req_present, match_req_fields CHARACTER(len=7) :: avg_name = 'average' - CHARACTER(len=MAX_NAME_LENGTH) :: time_units, timeb_units, avg, error_string, filename, aux_name, req_fields, fieldname - CHARACTER(len=MAX_NAME_LENGTH) :: suffix, base_name + CHARACTER(len=MAX_NAME_LENGTH) :: time_units, timeb_units, avg, error_string, filename, aux_name, req_fields + CHARACTER(len=MAX_NAME_LENGTH) :: fieldname, suffix, base_name CHARACTER(len=32) :: time_name, timeb_name,time_longname, timeb_longname, cart_name CHARACTER(len=MAX_FILENAME_LENGTH) :: fname CHARACTER(len=24) :: start_date @@ -2290,10 +2290,10 @@ integer function find_first_fms_percent(filename) integer :: i integer :: first_percent integer :: first_percent_loc - + first_percent = INDEX(filename, '%1') do i=2,9 - write(first_fms_percent,"('%',i1)") i + write(first_fms_percent,"('%',i1)") i first_percent_loc = INDEX(filename, first_fms_percent) if (first_percent_loc > 0) then if (first_percent == 0) then diff --git a/diag_manager/mppnccombine.c b/diag_manager/mppnccombine.c index bf0546c673..4dfb1c7877 100644 --- a/diag_manager/mppnccombine.c +++ b/diag_manager/mppnccombine.c @@ -142,19 +142,19 @@ #include #include #include - + #ifndef MAX_BF #define MAX_BF 100 /* maximum blocking factor */ #endif #ifndef DEFAULT_BF /* default blocking factor, if none set */ #define DEFAULT_BF 1 #endif - + /* Block size for NetCDF file open/reads */ #ifndef NC_BLKSZ #define NC_BLKSZ 65536 #endif - + /* Information structure for a file */ struct fileinfo { @@ -190,20 +190,20 @@ int flush_decomp (struct fileinfo *, int, int, int, unsigned char); void print_debug (struct fileinfo *, unsigned char); char *nc_type_to_str (nc_type); int min (int, int); - + static void ***varbuf = NULL; /* Buffers for multiple records of decomposed var */ - + struct rusage ruse; /* structure used to store values from getrusage() */ static unsigned long maxrss = 0; /* maximum memory used so far in kilobytes */ static int print_mem_usage = 0; static unsigned long mem_allocated = 0; /* memory allocated so far */ - + static const char version[] = "2024.02"; - + static unsigned long estimated_maxrss = 0; /* see option: -x */ static int mem_dry_run = 0; /* set if -x option is used */ - + static inline void check_mem_usage (void) { @@ -233,7 +233,7 @@ int flush_decomp (struct fileinfo *, int, int, int, unsigned char); prev_rss = rss; return; } - + static void print_estimated_mem_footprint (int verbose) { @@ -586,7 +586,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (!strcmp (strptr, ".0000")) outfilename[outlen - 5] = '\0'; } - + /* if -x (estimate memory usage) is set, k will be automatically set to 1 */ if (mem_dry_run) { @@ -598,10 +598,10 @@ char** find_partitioned_files(const char* filepath, int* count) { printf ("This run will estimate peak memory resident size. No output " "file will be created.\n"); } - + /* Disable fatal returns from netCDF library functions */ ncopts = 0; - + if (!mem_dry_run) { /* Create a new netCDF output file */ @@ -642,7 +642,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* No input files are specified on the command-line */ if (inputarg == (-1)) { @@ -904,7 +904,7 @@ char** find_partitioned_files(const char* filepath, int* count) { varbuf[k][v] = NULL; } } - + /* Cleanup and check for any input or output file errors */ if (ncsync (ncoutfile->ncfid) == (-1)) outfileerrors++; @@ -973,7 +973,7 @@ char** find_partitioned_files(const char* filepath, int* count) { return (1); } } - + /* Print the usage message for mppnccombine */ void usage () @@ -1071,7 +1071,7 @@ char** find_partitioned_files(const char* filepath, int* count) { "value of 1\n"); printf ("otherwise.\n"); } - + int min (int a, int b) { @@ -1079,7 +1079,7 @@ char** find_partitioned_files(const char* filepath, int* count) { return a; return b; } - + /* Open an input file and get some information about it, define the */ /* structure of the output file if necessary, prepare to copy all the */ /* variables for the current block to memory (and non-decomposed variables */ @@ -1103,10 +1103,10 @@ char** find_partitioned_files(const char* filepath, int* count) { char attname[MAX_NC_NAME]; /* Name of a global or variable attribute */ unsigned char ncinfileerror = 0; /* Were there any file errors? */ size_t blksz = NC_BLKSZ; /* netCDF block size */ - + if (print_mem_usage) check_mem_usage (); - + /* Information for netCDF input file */ if ((ncinfile = (struct fileinfo *)malloc (sizeof (struct fileinfo))) == NULL) @@ -1114,7 +1114,7 @@ char** find_partitioned_files(const char* filepath, int* count) { fprintf (stderr, "Error: cannot allocate enough memory!\n"); return (1); } - + /* Open an input netCDF file */ if ((ncinfile->ncfid = ncopen (ncname, NC_NOWRITE)) == (-1)) { @@ -1122,7 +1122,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Determine the number of files in the decomposed domain */ if (ncattget (ncinfile->ncfid, NC_GLOBAL, "NumFilesInSet", (void *)&nfiles2) == (-1)) @@ -1141,7 +1141,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } *nfiles = nfiles2; - + /* Get some general information about the input netCDF file */ if (ncinquire (ncinfile->ncfid, &(ncinfile->ndims), &(ncinfile->nvars), &(ncinfile->ngatts), &(ncinfile->recdim)) @@ -1152,7 +1152,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Get some information about the dimensions */ for (d = 0; d < ncinfile->ndims; d++) { @@ -1170,14 +1170,14 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->dimstart[d] = 1; ncinfile->dimend[d] = (-1); } - + /* Save some information for the output file */ if ((block == 0) && (!mem_dry_run)) { ncoutfile->nvars = ncinfile->nvars; ncoutfile->recdim = ncinfile->recdim; } - + /* Get some information about the variables */ for (v = 0; v < ncinfile->nvars; v++) { @@ -1191,7 +1191,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* If the variable is also a dimension then get decomposition info */ if ((dimid = ncdimid (ncinfile->ncfid, ncinfile->varname[v])) != (-1)) { @@ -1213,14 +1213,14 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Get some additional information about the variables */ for (v = 0; v < ncinfile->nvars; v++) { - + /* start by assuming the variable has no decomposed dimension */ ncinfile->vardecomp[v] = 0; - + /* now, iterate over the variable's dimensions and mark the */ /* variable as a decomposed variable if any dimension of */ /* the variable is decomposed */ @@ -1233,7 +1233,7 @@ char** find_partitioned_files(const char* filepath, int* count) { break; } } - + /* Save some information for the output file */ /* This only needs to be done once per output file */ if ((!appendnc) && (!mem_dry_run)) @@ -1248,13 +1248,13 @@ char** find_partitioned_files(const char* filepath, int* count) { ncoutfile->varmiss[v] = 0; } } - + /* If the output netCDF file was just created then define its structure */ if ((!appendnc) && (!mem_dry_run)) { if (verbose) printf (" Creating output \"%s\"\n", outncname); - + /* Determine the format of the input netCDF file */ if (nc_inq_format (ncinfile->ncfid, &ncinformat) == (-1)) { @@ -1263,7 +1263,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Determine the format of the output netCDF file */ if (nc_inq_format (ncoutfile->ncfid, &ncoutformat) == (-1)) { @@ -1272,11 +1272,11 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + if (verbose) printf (" ncinformat=%d, ncoutformat=%d\n", ncinformat, ncoutformat); - + /* If the format option (-64 or -n4) for the output netCDF file * is not specified then recreate the output netCDF file based * upon the format of the input netCDF file. */ @@ -1320,7 +1320,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } ncsetfill (ncoutfile->ncfid, NC_NOFILL); } - + /* Define the dimensions */ for (d = 0; d < ncinfile->ndims; d++) { @@ -1330,7 +1330,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncdimdef (ncoutfile->ncfid, ncinfile->dimname[d], ncinfile->dimfullsize[d]); } - + /* Define the variables and copy their attributes */ for (v = 0; v < ncinfile->nvars; v++) { @@ -1367,7 +1367,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Copy the global attributes */ for (n = 0; n < ncinfile->ngatts; n++) { @@ -1390,7 +1390,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + if (deflate == 1 && deflation > 0) { for (v = 0; v < ncinfile->nvars; v++) @@ -1405,11 +1405,11 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Definitions done */ nc__enddef (ncoutfile->ncfid, headerpad, 4, 0, 4); } - + /* Copy all data values of the dimensions and variables to memory */ /* For non-decomposed variables, process_vars will write them to the */ /* output file. Decomposed variables for N records from this file will */ @@ -1436,13 +1436,13 @@ char** find_partitioned_files(const char* filepath, int* count) { r < min ( ((block + 1) * (*bf)), *nrecs)); // r is a minimum of the next block start point and nrecs - + /* Done */ ncclose (ncinfile->ncfid); free (ncinfile); return (ncinfileerror); } - + /* Decomposed variables from an input file and record will be written to memory */ /* non-decomposed variables will be written to the output file */ @@ -1468,17 +1468,17 @@ char** find_partitioned_files(const char* filepath, int* count) { int recdimsize; /* Using a local value to correct issue when netcdf file does not have a record dimension */ long long varbufsize; - + if (ncinfile->recdim < 0) recdimsize = 1; else recdimsize = ncinfile->dimsize[ncinfile->recdim]; - + /* Check the number of records */ if (*nrecs == 1) { *nrecs = recdimsize; - + /* adjust bf */ if ((*bf) >= 1) { @@ -1504,7 +1504,7 @@ char** find_partitioned_files(const char* filepath, int* count) { *nblocks = (int)((*nrecs) / (*bf)) + 1; else *nblocks = (int)((*nrecs) / (*bf)); - + if (verbose) fprintf (stderr, "blocking factor=%d, num. blocks=%d, num. records=%d\n", *bf, @@ -1517,7 +1517,7 @@ char** find_partitioned_files(const char* filepath, int* count) { "Error: different number of records than the first input file!\n"); return (1); } - + /* Allocate memory for the decomposed variables, if none has been allocated yet We use an optimized algorithm to malloc and set up a double dimension array using a single malloc call. We do the cross-linking after the @@ -1563,13 +1563,13 @@ char** find_partitioned_files(const char* filepath, int* count) { } */ } /* end of memory allocation, done once per block */ - + /* Loop over all the variables */ for (v = 0; v < ncinfile->nvars; v++) { if (verbose > 1) printf (" variable = %s\n", ncinfile->varname[v]); - + /* Get read/write dimension sizes for the variable */ recsize = 1; recfullsize = 1; @@ -1597,7 +1597,7 @@ char** find_partitioned_files(const char* filepath, int* count) { printf (" dim %d: instart=%ld outstart=%ld count=%ld\n", d, instart[d], outstart[d], count[d]); } - + /* Prevent unnecessary reads/writes */ if (r > 0) { @@ -1618,11 +1618,11 @@ char** find_partitioned_files(const char* filepath, int* count) { { /* Prevent unnecessary reads/writes of non-decomposed variables if (ncinfile->vardecomp[v]!=1 && appendnc) continue; */ - + /* Non-record variables */ if (varrecdim == (-1)) continue; - + /* Non-decomposed record variables */ if (ncinfile->vardecomp[v] != 1 && f > 0) continue; @@ -1633,7 +1633,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (ncinfile->vardecomp[v] != 1 && appendnc) continue; } - + /* Allocate a buffer for the variable's record */ if ((values = malloc (nctypelen (ncinfile->datatype[v]) * recsize)) == NULL) @@ -1645,7 +1645,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->varname[v]); return (1); } - + /* Read the variable */ if (varrecdim != (-1)) instart[varrecdim] = outstart[varrecdim] = r; @@ -1655,7 +1655,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->varname[v]); return (1); } - + /* Write the buffered variable immediately if it's not decomposed */ if ((ncinfile->vardecomp[v] != 1) && (!mem_dry_run)) { @@ -1740,7 +1740,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (verbose > 1) printf (" writing %lld bytes to memory\n", nctypelen (ncinfile->datatype[v]) * recsize); - + imax = ncinfile ->dimsize[ncinfile->vardim[v][ncinfile->varndims[v] - 1]]; if (ncinfile->varndims[v] > 1) @@ -1776,7 +1776,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (verbose > 1) printf (" imap=%d jmax=%d kmax=%d lmax=%d\n", imax, jmax, kmax, lmax); - + imaxfull = ncinfile->dimfullsize[ncinfile->vardim[v][ncinfile->varndims[v] - 1]]; @@ -1808,7 +1808,7 @@ char** find_partitioned_files(const char* filepath, int* count) { imaxfull, jmaxfull, kmaxfull, lmaxfull); imaxjmaxfull = imaxfull * jmaxfull; imaxjmaxkmaxfull = imaxfull * jmaxfull * kmaxfull; - + ioffset = outstart[ncinfile->varndims[v] - 0 - 1]; if (ncinfile->varndims[v] > 1) joffset = outstart[ncinfile->varndims[v] - 1 - 1]; @@ -1939,7 +1939,7 @@ char** find_partitioned_files(const char* filepath, int* count) { break; } } - + /* Deallocate the decomposed variable's buffer */ if (values != NULL) free (values); @@ -1947,7 +1947,7 @@ char** find_partitioned_files(const char* filepath, int* count) { first = 0; return (0); } - + /* Write all the buffered decomposed variables to the output file */ int flush_decomp (struct fileinfo *ncoutfile, int nfiles, int r, int bf, @@ -1957,12 +1957,12 @@ char** find_partitioned_files(const char* filepath, int* count) { long outstart[MAX_NC_DIMS]; /* Data array sizes */ long count[MAX_NC_DIMS]; /* " */ int varrecdim; /* Position of a variable's record dimension */ - + if (verbose > 1) { printf (" nvars=%d\n", ncoutfile->nvars); } - + /* Write out all the decomposed variables */ for (v = 0; v < ncoutfile->nvars; v++) { @@ -2002,4 +2002,4 @@ char** find_partitioned_files(const char* filepath, int* count) { } } return (0); - } \ No newline at end of file + } diff --git a/fms2_io/netcdf_io.F90 b/fms2_io/netcdf_io.F90 index e3c120b8e8..bdfba1f843 100644 --- a/fms2_io/netcdf_io.F90 +++ b/fms2_io/netcdf_io.F90 @@ -247,7 +247,7 @@ module netcdf_io_mod public :: read_restart_bc public :: flush_file public :: partitioned_global_files -public :: partitioned_section_files +public :: partitioned_section_files public :: append_to_filepath_list !> @ingroup netcdf_io_mod @@ -703,7 +703,7 @@ subroutine append_to_filepath_list(filepath, filepath_list) else current => filepath_list do - ! If file is already in the list, return. This happens when new_file_freq (e.g., daily) + ! If file is already in the list, return. This happens when new_file_freq (e.g., daily) ! is more frequent than the date suffix (e.g., %4yr-%2mo), leading to overriding the file. if (string_compare(current%path, trim(filepath))) return ! If we reach the end of the list, exit the loop to add the new file. diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index b8aaadabe3..5bbde5ce6a 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -503,7 +503,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) call mpp_clock_begin(group_pack_clock) !pack the data buffer_start_pos = 0 - ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined if (use_device_ptr) then #include @@ -538,7 +538,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) !---unpack the buffer nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) - ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined if (use_device_ptr) then #include @@ -760,7 +760,7 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) call mpp_clock_begin(nonblock_group_pack_clock) npack = group%npack buffer_start_pos = group%buffer_start_pos - ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined if (use_device_ptr) then #include @@ -841,7 +841,7 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(nonblock_group_unpk_clock) buffer_start_pos = group%buffer_start_pos - ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined if (use_device_ptr) then #include From d26f787f40f77f0eecfe9dd3640a433bb417d599 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 6 May 2026 12:39:38 -0600 Subject: [PATCH 16/28] One more lint clean-up Missed one of the long lines in cesm_constants.fh --- constants/cesm_constants.fh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/constants/cesm_constants.fh b/constants/cesm_constants.fh index eee816ce88..ed8cb6f197 100644 --- a/constants/cesm_constants.fh +++ b/constants/cesm_constants.fh @@ -30,9 +30,10 @@ real(R8), public, parameter :: KAPPA = RDGAS/CP_AIR !< RDGAS / CP real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water [m^3/kg] real(R8), public, parameter :: RHO_CP = RHO0*CP_OCEAN !< (kg/m^3)*(cal/kg/deg C)(joules/cal) = !<(joules/m^3/deg C) [J/m^3/deg] -real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of the - !< atmosphere through the Saturation Vapour Pressure - !< expression when using DO_SIMPLE. [dimensionless] +real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of + !! the atmosphere through the Saturation Vapour + !! Pressure expression when using DO_SIMPLE. + !! [dimensionless] real(R8), public, parameter :: HLS = HLV + HLF !< Latent heat of sublimation [J/kg] real(R8), public, parameter :: WTMOZONE = 47.99820_r8 !< Molecular weight of ozone [AMU] real(R8), public, parameter :: WTMC = 12.00000_r8 !< Molecular weight of carbon [AMU] From 6b71b2d53eba9460aaa8713b006f95dce67adbf6 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Wed, 6 May 2026 12:55:39 -0600 Subject: [PATCH 17/28] One more linting commit I thought I shortened all the long lines in cesm_constants.fh but my awk command to find the line length was off-by-one --- constants/cesm_constants.fh | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/constants/cesm_constants.fh b/constants/cesm_constants.fh index ed8cb6f197..2904b0e4f9 100644 --- a/constants/cesm_constants.fh +++ b/constants/cesm_constants.fh @@ -27,7 +27,8 @@ character(len=18), public, parameter :: constants_version = 'FMSConstants: CESM' ! Constants below use CESM shr values real(R8), public, parameter :: KAPPA = RDGAS/CP_AIR !< RDGAS / CP_AIR [dimensionless] -real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water [m^3/kg] +real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water + !! [m^3/kg] real(R8), public, parameter :: RHO_CP = RHO0*CP_OCEAN !< (kg/m^3)*(cal/kg/deg C)(joules/cal) = !<(joules/m^3/deg C) [J/m^3/deg] real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of @@ -54,13 +55,14 @@ real(R8), public, parameter :: RADIAN = RAD_TO_DEG !< Equal to R !! [rad/deg] real(R8), public, parameter :: ALOGMIN = -50.0_r8 !< Minimum value allowed as argument to log function !! [N/A] -real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero exceptions - !! [N/A] +real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero + !! exceptions [N/A] real(R8), public, parameter :: RADCON = ((1.0E+02_r8*GRAV)/(1.0E+04_r8*CP_AIR))*SECONDS_PER_DAY !< convert flux !! divergence to heating rate in degrees per day !! [deg sec/(cm day)] -real(R8), public, parameter :: RADCON_MKS = (GRAV/CP_AIR)*SECONDS_PER_DAY !< Factor used to convert flux divergence to - !! heating rate in degrees per day [deg sec/(m day)] +real(R8), public, parameter :: RADCON_MKS = (GRAV/CP_AIR)*SECONDS_PER_DAY !< Factor used to convert flux divergence + !! to heating rate in degrees per day + !! [deg sec/(m day)] real(R8), public, parameter :: O2MIXRAT = 2.0953E-01_r8 !< Mixing ratio of molecular oxygen in air !! [dimensionless] real(R8), public, parameter :: C2DBARS = 1.e-4_r8 !< rho*g*z(mks) to dbars: 1dbar = 10^4 From 1d377f23289cbf0fc18e83c773f217982e94467f Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 8 May 2026 14:04:17 -0600 Subject: [PATCH 18/28] Update github actions Use a container with gcc 15.1.0 --- .github/workflows/build_cmake_gnu.yml | 2 +- .github/workflows/build_ubuntu_gnu.yml | 27 +++++++++++++------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/.github/workflows/build_cmake_gnu.yml b/.github/workflows/build_cmake_gnu.yml index f649345d8a..a86329faa3 100644 --- a/.github/workflows/build_cmake_gnu.yml +++ b/.github/workflows/build_cmake_gnu.yml @@ -10,7 +10,7 @@ jobs: omp-flags: [ -DOPENMP=on, -DOPENMP=off ] libyaml-flag: [ "", -DWITH_YAML=on ] container: - image: noaagfdl/hpc-me.ubuntu-minimal:cmake + image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 env: CMAKE_FLAGS: "${{ matrix.omp-flags }} ${{ matrix.libyaml-flag }} -D64BIT=on" steps: diff --git a/.github/workflows/build_ubuntu_gnu.yml b/.github/workflows/build_ubuntu_gnu.yml index f4dc48225f..b5a866c5dc 100644 --- a/.github/workflows/build_ubuntu_gnu.yml +++ b/.github/workflows/build_ubuntu_gnu.yml @@ -10,27 +10,26 @@ jobs: shell: bash strategy: matrix: - conf-flags: [--disable-openmp, --enable-mixed-mode, --disable-setting-flags, --with-mpi=no] - input-flag: [--with-yaml, --enable-test-input=/home/unit_tests_input] + conf-flag: [ --disable-openmp, --with-mpi=no, --disable-r8-default] + input-flag: [--enable-test-input=/home/unit_tests_input] + exclude: + - conf-flag: --with-mpi=no + input-flag: --enable-test-input=/home/unit_tests_input container: - image: noaagfdl/hpc-me.ubuntu-minimal:gnu-input + image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 env: - TEST_VERBOSE: 1 - DISTCHECK_CONFIGURE_FLAGS: "${{ matrix.conf-flags }} ${{ matrix.input-flag }}" + DISTCHECK_CONFIGURE_FLAGS: "${{ matrix.conf-flag }} ${{ matrix.input-flag }}" + DEBUG_FLAGS: "-O0 -g -fbounds-check -ffpe-trap=invalid,zero,overflow" steps: - name: Checkout code - uses: actions/checkout@v2 + uses: actions/checkout@v6.0.1 - name: Prepare GNU autoconf for build run: autoreconf -if - name: Configure the build - if: ${{ matrix.conf-flags != '--disable-setting-flags' }} - run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} || cat config.log - - name: Configure the build with compiler flags - if: ${{ matrix.conf-flags == '--disable-setting-flags' }} - run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} FCFLAGS="-fdefault-real-8 -fdefault-double-8 -fcray-pointer -ffree-line-length-none -I/usr/include $FCFLAGS" || cat config.log - - name: Build the library + run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} FCFLAGS="$FCFLAGS $DEBUG_FLAGS" || cat config.log + - name: Run distcheck run: make distcheck - if: ${{ matrix.conf-flags != '--with-mpi=no' }} + if: ${{ matrix.conf-flag != '--with-mpi=no' }} - name: Build the library (without test suite for serial build) run: make - if: ${{ matrix.conf-flags == '--with-mpi=no' }} + if: ${{ matrix.conf-flag == '--with-mpi=no' }} From 695a88b422843cd7c107705ff226b12b87557224 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 8 May 2026 15:56:41 -0600 Subject: [PATCH 19/28] Turn off autoconf CI testing Also, don't define CESM_CONSTANTS macro (will let us drop CESM share code from the turbo-stack submodules, and is bit-for-bit) --- .github/workflows/build_ubuntu_gnu.yml | 35 -------------------------- constants/fmsconstants.F90 | 1 - 2 files changed, 36 deletions(-) delete mode 100644 .github/workflows/build_ubuntu_gnu.yml diff --git a/.github/workflows/build_ubuntu_gnu.yml b/.github/workflows/build_ubuntu_gnu.yml deleted file mode 100644 index b5a866c5dc..0000000000 --- a/.github/workflows/build_ubuntu_gnu.yml +++ /dev/null @@ -1,35 +0,0 @@ -name: Build libFMS test with autotools - -on: [push, pull_request] - -jobs: - build: - runs-on: ubuntu-latest - defaults: - run: - shell: bash - strategy: - matrix: - conf-flag: [ --disable-openmp, --with-mpi=no, --disable-r8-default] - input-flag: [--enable-test-input=/home/unit_tests_input] - exclude: - - conf-flag: --with-mpi=no - input-flag: --enable-test-input=/home/unit_tests_input - container: - image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 - env: - DISTCHECK_CONFIGURE_FLAGS: "${{ matrix.conf-flag }} ${{ matrix.input-flag }}" - DEBUG_FLAGS: "-O0 -g -fbounds-check -ffpe-trap=invalid,zero,overflow" - steps: - - name: Checkout code - uses: actions/checkout@v6.0.1 - - name: Prepare GNU autoconf for build - run: autoreconf -if - - name: Configure the build - run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} FCFLAGS="$FCFLAGS $DEBUG_FLAGS" || cat config.log - - name: Run distcheck - run: make distcheck - if: ${{ matrix.conf-flag != '--with-mpi=no' }} - - name: Build the library (without test suite for serial build) - run: make - if: ${{ matrix.conf-flag == '--with-mpi=no' }} diff --git a/constants/fmsconstants.F90 b/constants/fmsconstants.F90 index c167a9ffd9..c28b82de01 100644 --- a/constants/fmsconstants.F90 +++ b/constants/fmsconstants.F90 @@ -55,7 +55,6 @@ module FMSconstants use platform_mod, only: r4_kind, r8_kind -#define CESM_CONSTANTS #if defined(CESM_CONSTANTS) use shr_kind_mod, only : R8 => shr_kind_r8 use shr_const_mod, only : & From a440407498e1f615c1be6ccbf6b8c4208d0ca2e4 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 8 May 2026 16:02:15 -0600 Subject: [PATCH 20/28] Update containers for intel and coupler CI tests --- .github/workflows/coupler.yml | 13 ++++++------- .github/workflows/intel_pr.yml | 16 +++++++--------- 2 files changed, 13 insertions(+), 16 deletions(-) diff --git a/.github/workflows/coupler.yml b/.github/workflows/coupler.yml index ea24899b95..1dc2e8d647 100644 --- a/.github/workflows/coupler.yml +++ b/.github/workflows/coupler.yml @@ -5,21 +5,20 @@ jobs: coupler-build: runs-on: ubuntu-latest container: - image: ryanmulhall/hpc-me.ubuntu-minimal:coupler + image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 env: CC: mpicc FC: mpif90 - CPPFLAGS: '-I/usr/include -Duse_LARGEFILE -DMAXFIELDMETHODS_=500' - FCFLAGS: '-fcray-pointer -fdefault-double-8 -fdefault-real-8 -Waliasing -ffree-line-length-none -fno-range-check -I/usr/include' - LDFLAGS: '-L/usr/lib' - VERBOSE: 1 + CPPFLAGS: '-I/opt/view/include' + FFLAGS: '-fallow-argument-mismatch' # mkmf uses FFLAGS instead of FC + LDFLAGS: '-L/opt/view/lib' steps: - name: Checkout FMS - uses: actions/checkout@v2 + uses: actions/checkout@v4.2.2 with: path: FMS - name: Checkout FMScoupler - uses: actions/checkout@v2 + uses: actions/checkout@v4.2.2 with: repository: 'NOAA-GFDL/FMScoupler' path: FMScoupler diff --git a/.github/workflows/intel_pr.yml b/.github/workflows/intel_pr.yml index d95519fbf2..e609c03044 100644 --- a/.github/workflows/intel_pr.yml +++ b/.github/workflows/intel_pr.yml @@ -3,17 +3,17 @@ jobs: intel-autotools: runs-on: ubuntu-latest container: - image: intel/oneapi-hpckit:2022.2-devel-ubuntu20.04 + image: intel/oneapi-hpckit:2025.2.2-0-devel-ubuntu24.04 env: - CC: mpiicc - FC: mpiifort + CC: mpicc + FC: mpiifx CFLAGS: "-I/libs/include" FCFLAGS: "-I/libs/include -g -traceback" LDFLAGS: "-L/libs/lib" TEST_VERBOSE: 1 I_MPI_FABRICS: "shm" # needed for mpi in image # intel bug causes some failures with shm option(required in container) - SKIP_TESTS: "test_mpp_update_domains.1 test_update_domains_performance.1 test_diag_manager2.23" + SKIP_TESTS: "test_fms2_io.1 test_mpp_update_domains.1 test_update_domains_performance.1 test_diag_manager2.23" steps: - name: Cache dependencies id: cache @@ -22,7 +22,7 @@ jobs: path: /libs key: ${{ runner.os }}-intel-libs - name: Install packages for building - run: apt update && apt install -y autoconf libtool automake zlibc zlib1g-dev + run: apt update && apt install -y autoconf libtool automake zlib1g-dev - if: steps.cache.outputs.cache-hit != 'true' name: Build netcdf run: | @@ -33,20 +33,18 @@ jobs: make -j install && cd .. wget https://github.com/Unidata/netcdf-c/archive/refs/tags/v4.8.1.tar.gz tar xf v4.8.1.tar.gz && cd netcdf-c-4.8.1 - ./configure --prefix=/libs --enable-remote-fortran-bootstrap + ./configure --prefix=/libs --enable-remote-fortran-bootstrap --disable-libxml2 make -j install # sets this here to pass embeded configure checks export LD_LIBRARY_PATH="/libs/lib:$LD_LIBRARY_PATH" make -j -k build-netcdf-fortran make -j install-netcdf-fortran - wget https://github.com/yaml/libyaml/releases/download/0.2.5/yaml-0.2.5.tar.gz - tar xf yaml-0.2.5.tar.gz && cd yaml-0.2.5 ./configure --prefix=/libs make -j install && cd - name: checkout uses: actions/checkout@v2 - name: Configure - run: autoreconf -if ./configure.ac && ./configure --with-yaml + run: autoreconf -if ./configure.ac && ./configure - name: Compile run: make -j || make - name: Run test suite From 207a4658c91f9047ee4c36da34b9b9cf25d4d773 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Fri, 8 May 2026 16:11:12 -0600 Subject: [PATCH 21/28] Drop FMS coupler CI test I think our version of FMS is much older than what the FMS coupler is expecting, so tests are failing... but we don't want to update our branch to include newer FMS code so we should drop the test instead --- .github/workflows/coupler.yml | 26 -------------------------- 1 file changed, 26 deletions(-) delete mode 100644 .github/workflows/coupler.yml diff --git a/.github/workflows/coupler.yml b/.github/workflows/coupler.yml deleted file mode 100644 index 1dc2e8d647..0000000000 --- a/.github/workflows/coupler.yml +++ /dev/null @@ -1,26 +0,0 @@ -name: Test coupler build -on: [pull_request] - -jobs: - coupler-build: - runs-on: ubuntu-latest - container: - image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 - env: - CC: mpicc - FC: mpif90 - CPPFLAGS: '-I/opt/view/include' - FFLAGS: '-fallow-argument-mismatch' # mkmf uses FFLAGS instead of FC - LDFLAGS: '-L/opt/view/lib' - steps: - - name: Checkout FMS - uses: actions/checkout@v4.2.2 - with: - path: FMS - - name: Checkout FMScoupler - uses: actions/checkout@v4.2.2 - with: - repository: 'NOAA-GFDL/FMScoupler' - path: FMScoupler - - name: Test Null build - run: FMScoupler/t/null_model_build.sh --local-fms From f88a0955c1dd07185dadd5f673d2fbb3c266bb46 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 09:22:00 -0600 Subject: [PATCH 22/28] Drop YAML from configure.ac --- configure.ac | 23 ----------------------- 1 file changed, 23 deletions(-) diff --git a/configure.ac b/configure.ac index 61675e71f7..6cf92e14f2 100644 --- a/configure.ac +++ b/configure.ac @@ -66,12 +66,6 @@ AC_ARG_WITH([mpi], AS_IF([test ${with_mpi:-yes} = yes], [with_mpi=yes], [with_mpi=no]) -AC_ARG_WITH([yaml], - [AS_HELP_STRING([--with-yaml], - [Build with YAML support. This option will be ignored if --disable-fortran-flag-setting is also given. (Default no)])]) -AS_IF([test ${with_yaml:-no} = no], - [with_yaml=no], - [with_yaml=yes]) AC_ARG_ENABLE([setting-flags], [AS_HELP_STRING([--enable-setting-flags], [Allow configure to set some compiler flags. Disabling this will also disable any other --with or --enable options that set flags, and will only use user-provided falgs. (Default yes)])]) @@ -181,19 +175,6 @@ if test $with_mpi = yes; then GX_MPI() fi -# Require yaml -if test $with_yaml = yes; then - AC_CHECK_HEADERS([yaml.h], [], [AC_MSG_ERROR(["Can't find the libYAML C header file. Set CC/CPPFLAGS/CFLAGS"])]) - AC_SEARCH_LIBS([yaml_parser_initialize], [yaml], [], [AC_MSG_ERROR(["Can't find the libYAML C library. Set CC/LDFLAGS/LIBS"])]) - - #If the test pass, define use_yaml macro - AC_DEFINE([use_yaml], [1], [This is required to use yaml parser]) - - AM_CONDITIONAL([SKIP_PARSER_TESTS], false ) -else - AM_CONDITIONAL([SKIP_PARSER_TESTS], true ) -fi - # Require netCDF AC_CHECK_HEADERS([netcdf.h], [], [AC_MSG_ERROR([Can't find the netCDF C header file. Set CPPFLAGS/CFLAGS])]) AC_SEARCH_LIBS([nc_create], [netcdf], [], [AC_MSG_ERROR([Can't find the netCDF C library. Set LDFLAGS/LIBS])]) @@ -335,10 +316,6 @@ if test $enable_setting_flags = yes; then FCFLAGS="$FCFLAGS -prof-gen=srcpos" CFLAGS="$CFLAGS -prof-gen=srcpos" fi - # add yaml flag - if test $with_yaml = yes; then - LDFLAGS="$LDFLAGS -lyaml" - fi fi # Find which mpi launcher to use From b01892162e38023003b3450f5f2802d305fa0941 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 09:32:14 -0600 Subject: [PATCH 23/28] Drop SKIP_PARSER_TESTS from makefiles I'm trying to make FMS configure look more like TIM configure, and some I removed some stuff from configure.ac that is now breaking Makefiles --- test_fms/data_override/Makefile.am | 4 ---- test_fms/parser/Makefile.am | 4 ---- 2 files changed, 8 deletions(-) diff --git a/test_fms/data_override/Makefile.am b/test_fms/data_override/Makefile.am index df990995f0..a622ceb5e7 100644 --- a/test_fms/data_override/Makefile.am +++ b/test_fms/data_override/Makefile.am @@ -36,11 +36,7 @@ test_data_override_SOURCES = test_data_override.F90 test_data_override_ongrid_SOURCES = test_data_override_ongrid.F90 test_get_grid_v1_SOURCES = test_get_grid_v1.F90 -if SKIP_PARSER_TESTS -skipflag="skip" -else skipflag="" -endif TEST_EXTENSIONS = .sh SH_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ diff --git a/test_fms/parser/Makefile.am b/test_fms/parser/Makefile.am index 05fbcd737c..e941af705e 100644 --- a/test_fms/parser/Makefile.am +++ b/test_fms/parser/Makefile.am @@ -49,11 +49,7 @@ TEST_EXTENSIONS = .sh SH_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ $(abs_top_srcdir)/test_fms/tap-driver.sh -if SKIP_PARSER_TESTS -skipflag="skip" -else skipflag="" -endif TESTS_ENVIRONMENT = parser_skip=${skipflag} # Clean up From 67cafb5a6ac89e78d343046061703d2c697225d0 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 10:01:19 -0600 Subject: [PATCH 24/28] Add mppnccombine.c to diag_manager makefile This c file is included in the TIM version of Makefile.ac but not the FMS2 version, and may be responsible for failing CI build --- diag_manager/Makefile.am | 1 + 1 file changed, 1 insertion(+) diff --git a/diag_manager/Makefile.am b/diag_manager/Makefile.am index 37759e838f..8c19b996d7 100644 --- a/diag_manager/Makefile.am +++ b/diag_manager/Makefile.am @@ -33,6 +33,7 @@ noinst_LTLIBRARIES = libdiag_manager.la libdiag_manager_la_SOURCES = \ diag_axis.F90 \ diag_data.F90 \ + mppnccombine.c \ diag_grid.F90 \ diag_manager.F90 \ diag_output.F90 \ From 9369bf04d2386aef13c1ca50b270230b3a5a893f Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 10:18:59 -0600 Subject: [PATCH 25/28] Remove bad test test_gather2DV does not exist in TIM or in more recent FMS commits, and it has some code that doesn't compile... I suspect that's related to why the test was removed in subsequent PRs --- test_fms/mpp/test_mpp_gatscat.F90 | 118 ------------------------------ 1 file changed, 118 deletions(-) diff --git a/test_fms/mpp/test_mpp_gatscat.F90 b/test_fms/mpp/test_mpp_gatscat.F90 index 47ff6cf81c..3ebacf61d9 100644 --- a/test_fms/mpp/test_mpp_gatscat.F90 +++ b/test_fms/mpp/test_mpp_gatscat.F90 @@ -779,122 +779,4 @@ subroutine test_gatherV(npes,pe,root,out_unit) deallocate(sdata,rdata,ref) end subroutine test_gatherV - !> @brief Test the 2D vector mpp_gather routine. - !> @todo This is a legacy routine which does not work in all conditions. For the gcc version, - !> the use of cray pointers is suspect to causing a crash at the call to mpp_gather. -subroutine test_gather2DV(npes,pe,root,out_unit) - implicit none - integer, intent(in) :: npes,pe,root,out_unit - - integer :: pelist(npes),rsize(npes) - integer :: pelist2(npes),rsize2(npes) - integer :: i,j,k,l,nz,ssize,nelems - real,allocatable,dimension(:,:) :: data, cdata, sbuff,rbuff - real,allocatable :: ref(:,:) - integer, parameter :: KSIZE=10 - - real :: sbuff1D(size(sbuff)) - real :: rbuff1D(size(rbuff)) - pointer(sptr,sbuff1D); pointer(rptr,rbuff1D) - - - if(npes < 3)then - call mpp_error(FATAL, "Test_gather2DV: minimum of 3 ranks required. Not testing gather; too few ranks.") - elseif(npes > 9999)then - call mpp_error(FATAL, "Test_gather2DV: maximum of 9999 ranks supported. Not testing gather2DV; too many ranks.") - return - endif - write(out_unit,*) - - ssize = pe+1 - allocate(data(ssize,KSIZE)) - do k=1,KSIZE; do i=1,ssize - data(i,k) = 10000.0*k + pe + 0.0001*i - enddo; enddo - do i=1,npes - pelist(i) = i-1 - rsize(i) = i - enddo - - nz = KSIZE - nelems = sum(rsize(:)) - - allocate(rbuff(nz,nelems)); rbuff = -1.0 - allocate(ref(nelems,nz),cdata(nelems,nz)) - ref = 0.0; cdata = 0.0 - if(pe == root)then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize(j) - ref(l,k) = 10000.0*k + pelist(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - allocate(sbuff(nz,ssize)) - ! this matrix inversion makes for easy gather to the IO root - ! and a clear, concise unpack - do j=1,ssize - do i=1,nz - sbuff(i,j) = data(j,i) - enddo; enddo - - ! Note that the gatherV implied here is asymmetric; only root needs to know the vector of recv size - sptr = LOC(sbuff); rptr = LOC(rbuff) - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize(:)) - - if(pe == root)then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV global pelist failed") - endif - enddo;enddo - endif - - call mpp_sync() - write(out_unit,*) "Test gather2DV with global pelist successful" - - do i=1,npes - pelist2(i) = pelist(npes-i+1) - rsize2(i) = rsize(npes-i+1) - enddo - - rbuff = -1.0 - ref = 0.0; cdata = 0.0 - if(pe == pelist2(1))then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize2(j) - ref(l,k) = 10000.0*k + pelist2(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize2(:),pelist2) - - if(pe == pelist2(1))then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV with reversed pelist failed") - endif - enddo;enddo - endif - call mpp_sync() - write(out_unit,*) "Test gather2DV with reversed pelist successful" - deallocate(data,sbuff,rbuff,cdata,ref) - end subroutine test_gather2DV - end program test_mpp_gatscat From 9a76ecebffb7862038ad399602bd96742d0a216f Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 10:36:47 -0600 Subject: [PATCH 26/28] Increase filename size test_simple_domain.nc does not fit into character(len=20); increased to len=25 just to have some breathing room --- test_fms/fms2_io/test_domain_io.F90 | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test_fms/fms2_io/test_domain_io.F90 b/test_fms/fms2_io/test_domain_io.F90 index 07a3e2845a..8b170f3819 100644 --- a/test_fms/fms2_io/test_domain_io.F90 +++ b/test_fms/fms2_io/test_domain_io.F90 @@ -46,7 +46,7 @@ program test_domain_read integer :: xhalo = 3 !< Number of halo points in X integer :: yhalo = 2 !< Number of halo points in Y integer :: nz = 2 !< Number of points in the z dimension - character(len=20) :: filename="test.nc" !< Name of the file + character(len=25) :: filename="test.nc" !< Name of the file logical :: use_edges=.false. !< Use North and East domain positions integer :: ndim4 !< Number of points in dim4 From e68d0034516ca6af8e99cef114473bde59c9ef76 Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 10:40:35 -0600 Subject: [PATCH 27/28] Revert "Drop SKIP_PARSER_TESTS from makefiles" This reverts commit b01892162e38023003b3450f5f2802d305fa0941. --- test_fms/data_override/Makefile.am | 4 ++++ test_fms/parser/Makefile.am | 4 ++++ 2 files changed, 8 insertions(+) diff --git a/test_fms/data_override/Makefile.am b/test_fms/data_override/Makefile.am index a622ceb5e7..df990995f0 100644 --- a/test_fms/data_override/Makefile.am +++ b/test_fms/data_override/Makefile.am @@ -36,7 +36,11 @@ test_data_override_SOURCES = test_data_override.F90 test_data_override_ongrid_SOURCES = test_data_override_ongrid.F90 test_get_grid_v1_SOURCES = test_get_grid_v1.F90 +if SKIP_PARSER_TESTS +skipflag="skip" +else skipflag="" +endif TEST_EXTENSIONS = .sh SH_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ diff --git a/test_fms/parser/Makefile.am b/test_fms/parser/Makefile.am index e941af705e..05fbcd737c 100644 --- a/test_fms/parser/Makefile.am +++ b/test_fms/parser/Makefile.am @@ -49,7 +49,11 @@ TEST_EXTENSIONS = .sh SH_LOG_DRIVER = env AM_TAP_AWK='$(AWK)' $(SHELL) \ $(abs_top_srcdir)/test_fms/tap-driver.sh +if SKIP_PARSER_TESTS +skipflag="skip" +else skipflag="" +endif TESTS_ENVIRONMENT = parser_skip=${skipflag} # Clean up From 5284362d371df19afb5da96e9d605801ac2d556f Mon Sep 17 00:00:00 2001 From: Michael Levy Date: Mon, 11 May 2026 10:40:54 -0600 Subject: [PATCH 28/28] Revert "Drop YAML from configure.ac" This reverts commit f88a0955c1dd07185dadd5f673d2fbb3c266bb46. --- configure.ac | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/configure.ac b/configure.ac index 6cf92e14f2..61675e71f7 100644 --- a/configure.ac +++ b/configure.ac @@ -66,6 +66,12 @@ AC_ARG_WITH([mpi], AS_IF([test ${with_mpi:-yes} = yes], [with_mpi=yes], [with_mpi=no]) +AC_ARG_WITH([yaml], + [AS_HELP_STRING([--with-yaml], + [Build with YAML support. This option will be ignored if --disable-fortran-flag-setting is also given. (Default no)])]) +AS_IF([test ${with_yaml:-no} = no], + [with_yaml=no], + [with_yaml=yes]) AC_ARG_ENABLE([setting-flags], [AS_HELP_STRING([--enable-setting-flags], [Allow configure to set some compiler flags. Disabling this will also disable any other --with or --enable options that set flags, and will only use user-provided falgs. (Default yes)])]) @@ -175,6 +181,19 @@ if test $with_mpi = yes; then GX_MPI() fi +# Require yaml +if test $with_yaml = yes; then + AC_CHECK_HEADERS([yaml.h], [], [AC_MSG_ERROR(["Can't find the libYAML C header file. Set CC/CPPFLAGS/CFLAGS"])]) + AC_SEARCH_LIBS([yaml_parser_initialize], [yaml], [], [AC_MSG_ERROR(["Can't find the libYAML C library. Set CC/LDFLAGS/LIBS"])]) + + #If the test pass, define use_yaml macro + AC_DEFINE([use_yaml], [1], [This is required to use yaml parser]) + + AM_CONDITIONAL([SKIP_PARSER_TESTS], false ) +else + AM_CONDITIONAL([SKIP_PARSER_TESTS], true ) +fi + # Require netCDF AC_CHECK_HEADERS([netcdf.h], [], [AC_MSG_ERROR([Can't find the netCDF C header file. Set CPPFLAGS/CFLAGS])]) AC_SEARCH_LIBS([nc_create], [netcdf], [], [AC_MSG_ERROR([Can't find the netCDF C library. Set LDFLAGS/LIBS])]) @@ -316,6 +335,10 @@ if test $enable_setting_flags = yes; then FCFLAGS="$FCFLAGS -prof-gen=srcpos" CFLAGS="$CFLAGS -prof-gen=srcpos" fi + # add yaml flag + if test $with_yaml = yes; then + LDFLAGS="$LDFLAGS -lyaml" + fi fi # Find which mpi launcher to use