diff --git a/.github/workflows/build_cmake_gnu.yml b/.github/workflows/build_cmake_gnu.yml index f649345d8a..a86329faa3 100644 --- a/.github/workflows/build_cmake_gnu.yml +++ b/.github/workflows/build_cmake_gnu.yml @@ -10,7 +10,7 @@ jobs: omp-flags: [ -DOPENMP=on, -DOPENMP=off ] libyaml-flag: [ "", -DWITH_YAML=on ] container: - image: noaagfdl/hpc-me.ubuntu-minimal:cmake + image: ghcr.io/noaa-gfdl/fms/fms-ci-rocky-gnu:15.1.0 env: CMAKE_FLAGS: "${{ matrix.omp-flags }} ${{ matrix.libyaml-flag }} -D64BIT=on" steps: diff --git a/.github/workflows/build_ubuntu_gnu.yml b/.github/workflows/build_ubuntu_gnu.yml deleted file mode 100644 index f4dc48225f..0000000000 --- a/.github/workflows/build_ubuntu_gnu.yml +++ /dev/null @@ -1,36 +0,0 @@ -name: Build libFMS test with autotools - -on: [push, pull_request] - -jobs: - build: - runs-on: ubuntu-latest - defaults: - run: - shell: bash - strategy: - matrix: - conf-flags: [--disable-openmp, --enable-mixed-mode, --disable-setting-flags, --with-mpi=no] - input-flag: [--with-yaml, --enable-test-input=/home/unit_tests_input] - container: - image: noaagfdl/hpc-me.ubuntu-minimal:gnu-input - env: - TEST_VERBOSE: 1 - DISTCHECK_CONFIGURE_FLAGS: "${{ matrix.conf-flags }} ${{ matrix.input-flag }}" - steps: - - name: Checkout code - uses: actions/checkout@v2 - - name: Prepare GNU autoconf for build - run: autoreconf -if - - name: Configure the build - if: ${{ matrix.conf-flags != '--disable-setting-flags' }} - run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} || cat config.log - - name: Configure the build with compiler flags - if: ${{ matrix.conf-flags == '--disable-setting-flags' }} - run: ./configure ${DISTCHECK_CONFIGURE_FLAGS} FCFLAGS="-fdefault-real-8 -fdefault-double-8 -fcray-pointer -ffree-line-length-none -I/usr/include $FCFLAGS" || cat config.log - - name: Build the library - run: make distcheck - if: ${{ matrix.conf-flags != '--with-mpi=no' }} - - name: Build the library (without test suite for serial build) - run: make - if: ${{ matrix.conf-flags == '--with-mpi=no' }} diff --git a/.github/workflows/coupler.yml b/.github/workflows/coupler.yml deleted file mode 100644 index ea24899b95..0000000000 --- a/.github/workflows/coupler.yml +++ /dev/null @@ -1,27 +0,0 @@ -name: Test coupler build -on: [pull_request] - -jobs: - coupler-build: - runs-on: ubuntu-latest - container: - image: ryanmulhall/hpc-me.ubuntu-minimal:coupler - env: - CC: mpicc - FC: mpif90 - CPPFLAGS: '-I/usr/include -Duse_LARGEFILE -DMAXFIELDMETHODS_=500' - FCFLAGS: '-fcray-pointer -fdefault-double-8 -fdefault-real-8 -Waliasing -ffree-line-length-none -fno-range-check -I/usr/include' - LDFLAGS: '-L/usr/lib' - VERBOSE: 1 - steps: - - name: Checkout FMS - uses: actions/checkout@v2 - with: - path: FMS - - name: Checkout FMScoupler - uses: actions/checkout@v2 - with: - repository: 'NOAA-GFDL/FMScoupler' - path: FMScoupler - - name: Test Null build - run: FMScoupler/t/null_model_build.sh --local-fms diff --git a/.github/workflows/intel_pr.yml b/.github/workflows/intel_pr.yml index d95519fbf2..e609c03044 100644 --- a/.github/workflows/intel_pr.yml +++ b/.github/workflows/intel_pr.yml @@ -3,17 +3,17 @@ jobs: intel-autotools: runs-on: ubuntu-latest container: - image: intel/oneapi-hpckit:2022.2-devel-ubuntu20.04 + image: intel/oneapi-hpckit:2025.2.2-0-devel-ubuntu24.04 env: - CC: mpiicc - FC: mpiifort + CC: mpicc + FC: mpiifx CFLAGS: "-I/libs/include" FCFLAGS: "-I/libs/include -g -traceback" LDFLAGS: "-L/libs/lib" TEST_VERBOSE: 1 I_MPI_FABRICS: "shm" # needed for mpi in image # intel bug causes some failures with shm option(required in container) - SKIP_TESTS: "test_mpp_update_domains.1 test_update_domains_performance.1 test_diag_manager2.23" + SKIP_TESTS: "test_fms2_io.1 test_mpp_update_domains.1 test_update_domains_performance.1 test_diag_manager2.23" steps: - name: Cache dependencies id: cache @@ -22,7 +22,7 @@ jobs: path: /libs key: ${{ runner.os }}-intel-libs - name: Install packages for building - run: apt update && apt install -y autoconf libtool automake zlibc zlib1g-dev + run: apt update && apt install -y autoconf libtool automake zlib1g-dev - if: steps.cache.outputs.cache-hit != 'true' name: Build netcdf run: | @@ -33,20 +33,18 @@ jobs: make -j install && cd .. wget https://github.com/Unidata/netcdf-c/archive/refs/tags/v4.8.1.tar.gz tar xf v4.8.1.tar.gz && cd netcdf-c-4.8.1 - ./configure --prefix=/libs --enable-remote-fortran-bootstrap + ./configure --prefix=/libs --enable-remote-fortran-bootstrap --disable-libxml2 make -j install # sets this here to pass embeded configure checks export LD_LIBRARY_PATH="/libs/lib:$LD_LIBRARY_PATH" make -j -k build-netcdf-fortran make -j install-netcdf-fortran - wget https://github.com/yaml/libyaml/releases/download/0.2.5/yaml-0.2.5.tar.gz - tar xf yaml-0.2.5.tar.gz && cd yaml-0.2.5 ./configure --prefix=/libs make -j install && cd - name: checkout uses: actions/checkout@v2 - name: Configure - run: autoreconf -if ./configure.ac && ./configure --with-yaml + run: autoreconf -if ./configure.ac && ./configure - name: Compile run: make -j || make - name: Run test suite diff --git a/constants/cesm_constants.fh b/constants/cesm_constants.fh index db2ba4bb6c..2904b0e4f9 100644 --- a/constants/cesm_constants.fh +++ b/constants/cesm_constants.fh @@ -27,12 +27,14 @@ character(len=18), public, parameter :: constants_version = 'FMSConstants: CESM' ! Constants below use CESM shr values real(R8), public, parameter :: KAPPA = RDGAS/CP_AIR !< RDGAS / CP_AIR [dimensionless] -real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water [m^3/kg] +real(R8), public, parameter :: RHO0R = 1.0_r8/RHO0 !< Reciprocal of average density of sea water + !! [m^3/kg] real(R8), public, parameter :: RHO_CP = RHO0*CP_OCEAN !< (kg/m^3)*(cal/kg/deg C)(joules/cal) = !<(joules/m^3/deg C) [J/m^3/deg] -real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of the - !< atmosphere through the Saturation Vapour Pressure - !< expression when using DO_SIMPLE. [dimensionless] +real(R8), public, parameter :: ES0 = 1.0_r8 !< Humidity factor. Controls the humidity content of + !! the atmosphere through the Saturation Vapour + !! Pressure expression when using DO_SIMPLE. + !! [dimensionless] real(R8), public, parameter :: HLS = HLV + HLF !< Latent heat of sublimation [J/kg] real(R8), public, parameter :: WTMOZONE = 47.99820_r8 !< Molecular weight of ozone [AMU] real(R8), public, parameter :: WTMC = 12.00000_r8 !< Molecular weight of carbon [AMU] @@ -49,15 +51,22 @@ real(R8), public, parameter :: SECONDS_PER_HOUR = 3600._r8 !< Seconds in real(R8), public, parameter :: SECONDS_PER_MINUTE = 60._r8 !< Seconds in a minute [s] real(R8), public, parameter :: RAD_TO_DEG = 180._r8/PI !< Degrees per radian [deg/rad] real(R8), public, parameter :: DEG_TO_RAD = PI/180._r8 !< Radians per degree [rad/deg] -real(R8), public, parameter :: RADIAN = RAD_TO_DEG !< Equal to RAD_TO_DEG for backward compatability. [rad/deg] -real(R8), public, parameter :: ALOGMIN = -50.0_r8 !< Minimum value allowed as argument to log function [N/A] -real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero exceptions [N/A] -real(R8), public, parameter :: RADCON = ((1.0E+02_r8*GRAV)/(1.0E+04_r8*CP_AIR))*SECONDS_PER_DAY !< convert flux divergence - !to heating rate in degrees per day [deg sec/(cm day)] -real(R8), public, parameter :: RADCON_MKS = (GRAV/CP_AIR)*SECONDS_PER_DAY !< Factor used to convert flux divergence to - !< heating rate in degrees per day [deg sec/(m day)] -real(R8), public, parameter :: O2MIXRAT = 2.0953E-01_r8 !< Mixing ratio of molecular oxygen in air [dimensionless] -real(R8), public, parameter :: C2DBARS = 1.e-4_r8 !< rho*g*z(mks) to dbars: 1dbar = 10^4 (kg/m^3)(m/s^2)m [dbars] +real(R8), public, parameter :: RADIAN = RAD_TO_DEG !< Equal to RAD_TO_DEG for backward compatability. + !! [rad/deg] +real(R8), public, parameter :: ALOGMIN = -50.0_r8 !< Minimum value allowed as argument to log function + !! [N/A] +real(R8), public, parameter :: EPSLN = 1.0e-40_r8 !< A small number to prevent divide by zero + !! exceptions [N/A] +real(R8), public, parameter :: RADCON = ((1.0E+02_r8*GRAV)/(1.0E+04_r8*CP_AIR))*SECONDS_PER_DAY !< convert flux + !! divergence to heating rate in degrees per day + !! [deg sec/(cm day)] +real(R8), public, parameter :: RADCON_MKS = (GRAV/CP_AIR)*SECONDS_PER_DAY !< Factor used to convert flux divergence + !! to heating rate in degrees per day + !! [deg sec/(m day)] +real(R8), public, parameter :: O2MIXRAT = 2.0953E-01_r8 !< Mixing ratio of molecular oxygen in air + !! [dimensionless] +real(R8), public, parameter :: C2DBARS = 1.e-4_r8 !< rho*g*z(mks) to dbars: 1dbar = 10^4 + !! (kg/m^3)(m/s^2)m [dbars] real(R8), public, parameter :: KELVIN = 273.15_r8 !< Degrees Kelvin at zero Celsius [K] #ifdef SMALL_EARTH @@ -70,4 +79,4 @@ real(R8), public, parameter :: KELVIN = 273.15_r8 !< Degrees Ke #endif #else real, public, parameter :: small_fac = 1._r8 -#endif \ No newline at end of file +#endif diff --git a/constants/fmsconstants.F90 b/constants/fmsconstants.F90 index c167a9ffd9..c28b82de01 100644 --- a/constants/fmsconstants.F90 +++ b/constants/fmsconstants.F90 @@ -55,7 +55,6 @@ module FMSconstants use platform_mod, only: r4_kind, r8_kind -#define CESM_CONSTANTS #if defined(CESM_CONSTANTS) use shr_kind_mod, only : R8 => shr_kind_r8 use shr_const_mod, only : & diff --git a/diag_manager/Makefile.am b/diag_manager/Makefile.am index 37759e838f..8c19b996d7 100644 --- a/diag_manager/Makefile.am +++ b/diag_manager/Makefile.am @@ -33,6 +33,7 @@ noinst_LTLIBRARIES = libdiag_manager.la libdiag_manager_la_SOURCES = \ diag_axis.F90 \ diag_data.F90 \ + mppnccombine.c \ diag_grid.F90 \ diag_manager.F90 \ diag_output.F90 \ diff --git a/diag_manager/diag_manager.F90 b/diag_manager/diag_manager.F90 index d0d6e4a3ad..335e805d7b 100644 --- a/diag_manager/diag_manager.F90 +++ b/diag_manager/diag_manager.F90 @@ -239,7 +239,8 @@ MODULE diag_manager_mod USE fms_diag_outfield_mod, ONLY: fmsDiagOutfieldIndex_type, fmsDiagOutfield_type USE fms_diag_fieldbuff_update_mod, ONLY: fieldbuff_update, fieldbuff_copy_missvals, & & fieldbuff_copy_fieldvals - use netcdf_io_mod, ONLY: filepath_list_type, partitioned_global_files, partitioned_section_files, append_to_filepath_list + use netcdf_io_mod, ONLY: filepath_list_type, partitioned_global_files, partitioned_section_files, & + & append_to_filepath_list #ifdef use_netCDF USE netcdf, ONLY: NF90_INT, NF90_FLOAT, NF90_CHAR @@ -376,7 +377,7 @@ MODULE diag_manager_mod END INTERFACE diag_field_add_attribute ! ----- interface to the C function ----- -interface +interface function exec_mppnccombine(outfile) bind(C) use, intrinsic :: iso_c_binding, only: c_int, c_char implicit none @@ -3733,7 +3734,7 @@ subroutine combine_files() integer(c_int) :: smallest_pix ! The smallest IO PE index of the set of IO PEs writing the current section file. integer(c_int) :: ireturn ! Return code from mppnccombine integer(c_int) :: niopes ! Number of IO PEs participating in writing of global files - integer :: f ! File index for the global diagnostic files + integer :: f ! File index for the global diagnostic files integer :: pix_order ! 0-based order of the IO PE in the list of all IO PEs writing the file. type(filepath_list_type), pointer :: current type(filepath_list_type), pointer :: files_to_combine ! list of files to combined by this PE @@ -3754,8 +3755,8 @@ subroutine combine_files() filepath = trim(adjustl(current%path)) outfile = filepath(1:len(filepath)-5) // c_null_char - ! get the number of files to combine (for the first global file only). The number of files is the - ! same for all global files. Similarly, get pix and pix_order for the first file only, since for + ! get the number of files to combine (for the first global file only). The number of files is the + ! same for all global files. Similarly, get pix and pix_order for the first file only, since for ! all global files, the pix and pix_order are the same. if (niopes == 0) then niopes = num_partitioned_files(outfile) @@ -3806,7 +3807,9 @@ subroutine combine_files() do while (associated(current)) !write(stdout_unit,*) ' Combining file ' // filepath(1:len(filepath)-5) ireturn = exec_mppnccombine(current%path) - if (ireturn /= 0) call error_mesg('diag_manager_mod::combine_files', 'mppnccombine failed for file ' // trim(current%path), FATAL) + if (ireturn /= 0) & + call error_mesg('diag_manager_mod::combine_files', 'mppnccombine failed for file ' // & + trim(current%path), FATAL) current => current%next end do end if @@ -3825,10 +3828,10 @@ function get_pix_order(filename, num_files, pix) result(pix_order) ! local integer :: pix_order ! 0-based order of the pix in the list of all IO PEs writing the file character(len=4) :: suffix ! 0000, 0001, etc. - integer :: npes ! total number of all PEs + integer :: npes ! total number of all PEs integer :: i, f logical :: exists - + npes = mpp_npes() pix_order = -1 diff --git a/diag_manager/diag_util.F90 b/diag_manager/diag_util.F90 index e42c2d7639..1e3c6f7557 100644 --- a/diag_manager/diag_util.F90 +++ b/diag_manager/diag_util.F90 @@ -1726,8 +1726,8 @@ SUBROUTINE opening_file(file, time, filename_time) INTEGER, ALLOCATABLE :: axesc(:) ! indices if compressed axes associated with the field LOGICAL :: time_ops, aux_present, match_aux_name, req_present, match_req_fields CHARACTER(len=7) :: avg_name = 'average' - CHARACTER(len=MAX_NAME_LENGTH) :: time_units, timeb_units, avg, error_string, filename, aux_name, req_fields, fieldname - CHARACTER(len=MAX_NAME_LENGTH) :: suffix, base_name + CHARACTER(len=MAX_NAME_LENGTH) :: time_units, timeb_units, avg, error_string, filename, aux_name, req_fields + CHARACTER(len=MAX_NAME_LENGTH) :: fieldname, suffix, base_name CHARACTER(len=32) :: time_name, timeb_name,time_longname, timeb_longname, cart_name CHARACTER(len=MAX_FILENAME_LENGTH) :: fname CHARACTER(len=24) :: start_date @@ -2290,10 +2290,10 @@ integer function find_first_fms_percent(filename) integer :: i integer :: first_percent integer :: first_percent_loc - + first_percent = INDEX(filename, '%1') do i=2,9 - write(first_fms_percent,"('%',i1)") i + write(first_fms_percent,"('%',i1)") i first_percent_loc = INDEX(filename, first_fms_percent) if (first_percent_loc > 0) then if (first_percent == 0) then diff --git a/diag_manager/mppnccombine.c b/diag_manager/mppnccombine.c index bf0546c673..4dfb1c7877 100644 --- a/diag_manager/mppnccombine.c +++ b/diag_manager/mppnccombine.c @@ -142,19 +142,19 @@ #include #include #include - + #ifndef MAX_BF #define MAX_BF 100 /* maximum blocking factor */ #endif #ifndef DEFAULT_BF /* default blocking factor, if none set */ #define DEFAULT_BF 1 #endif - + /* Block size for NetCDF file open/reads */ #ifndef NC_BLKSZ #define NC_BLKSZ 65536 #endif - + /* Information structure for a file */ struct fileinfo { @@ -190,20 +190,20 @@ int flush_decomp (struct fileinfo *, int, int, int, unsigned char); void print_debug (struct fileinfo *, unsigned char); char *nc_type_to_str (nc_type); int min (int, int); - + static void ***varbuf = NULL; /* Buffers for multiple records of decomposed var */ - + struct rusage ruse; /* structure used to store values from getrusage() */ static unsigned long maxrss = 0; /* maximum memory used so far in kilobytes */ static int print_mem_usage = 0; static unsigned long mem_allocated = 0; /* memory allocated so far */ - + static const char version[] = "2024.02"; - + static unsigned long estimated_maxrss = 0; /* see option: -x */ static int mem_dry_run = 0; /* set if -x option is used */ - + static inline void check_mem_usage (void) { @@ -233,7 +233,7 @@ int flush_decomp (struct fileinfo *, int, int, int, unsigned char); prev_rss = rss; return; } - + static void print_estimated_mem_footprint (int verbose) { @@ -586,7 +586,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (!strcmp (strptr, ".0000")) outfilename[outlen - 5] = '\0'; } - + /* if -x (estimate memory usage) is set, k will be automatically set to 1 */ if (mem_dry_run) { @@ -598,10 +598,10 @@ char** find_partitioned_files(const char* filepath, int* count) { printf ("This run will estimate peak memory resident size. No output " "file will be created.\n"); } - + /* Disable fatal returns from netCDF library functions */ ncopts = 0; - + if (!mem_dry_run) { /* Create a new netCDF output file */ @@ -642,7 +642,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* No input files are specified on the command-line */ if (inputarg == (-1)) { @@ -904,7 +904,7 @@ char** find_partitioned_files(const char* filepath, int* count) { varbuf[k][v] = NULL; } } - + /* Cleanup and check for any input or output file errors */ if (ncsync (ncoutfile->ncfid) == (-1)) outfileerrors++; @@ -973,7 +973,7 @@ char** find_partitioned_files(const char* filepath, int* count) { return (1); } } - + /* Print the usage message for mppnccombine */ void usage () @@ -1071,7 +1071,7 @@ char** find_partitioned_files(const char* filepath, int* count) { "value of 1\n"); printf ("otherwise.\n"); } - + int min (int a, int b) { @@ -1079,7 +1079,7 @@ char** find_partitioned_files(const char* filepath, int* count) { return a; return b; } - + /* Open an input file and get some information about it, define the */ /* structure of the output file if necessary, prepare to copy all the */ /* variables for the current block to memory (and non-decomposed variables */ @@ -1103,10 +1103,10 @@ char** find_partitioned_files(const char* filepath, int* count) { char attname[MAX_NC_NAME]; /* Name of a global or variable attribute */ unsigned char ncinfileerror = 0; /* Were there any file errors? */ size_t blksz = NC_BLKSZ; /* netCDF block size */ - + if (print_mem_usage) check_mem_usage (); - + /* Information for netCDF input file */ if ((ncinfile = (struct fileinfo *)malloc (sizeof (struct fileinfo))) == NULL) @@ -1114,7 +1114,7 @@ char** find_partitioned_files(const char* filepath, int* count) { fprintf (stderr, "Error: cannot allocate enough memory!\n"); return (1); } - + /* Open an input netCDF file */ if ((ncinfile->ncfid = ncopen (ncname, NC_NOWRITE)) == (-1)) { @@ -1122,7 +1122,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Determine the number of files in the decomposed domain */ if (ncattget (ncinfile->ncfid, NC_GLOBAL, "NumFilesInSet", (void *)&nfiles2) == (-1)) @@ -1141,7 +1141,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } *nfiles = nfiles2; - + /* Get some general information about the input netCDF file */ if (ncinquire (ncinfile->ncfid, &(ncinfile->ndims), &(ncinfile->nvars), &(ncinfile->ngatts), &(ncinfile->recdim)) @@ -1152,7 +1152,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Get some information about the dimensions */ for (d = 0; d < ncinfile->ndims; d++) { @@ -1170,14 +1170,14 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->dimstart[d] = 1; ncinfile->dimend[d] = (-1); } - + /* Save some information for the output file */ if ((block == 0) && (!mem_dry_run)) { ncoutfile->nvars = ncinfile->nvars; ncoutfile->recdim = ncinfile->recdim; } - + /* Get some information about the variables */ for (v = 0; v < ncinfile->nvars; v++) { @@ -1191,7 +1191,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* If the variable is also a dimension then get decomposition info */ if ((dimid = ncdimid (ncinfile->ncfid, ncinfile->varname[v])) != (-1)) { @@ -1213,14 +1213,14 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Get some additional information about the variables */ for (v = 0; v < ncinfile->nvars; v++) { - + /* start by assuming the variable has no decomposed dimension */ ncinfile->vardecomp[v] = 0; - + /* now, iterate over the variable's dimensions and mark the */ /* variable as a decomposed variable if any dimension of */ /* the variable is decomposed */ @@ -1233,7 +1233,7 @@ char** find_partitioned_files(const char* filepath, int* count) { break; } } - + /* Save some information for the output file */ /* This only needs to be done once per output file */ if ((!appendnc) && (!mem_dry_run)) @@ -1248,13 +1248,13 @@ char** find_partitioned_files(const char* filepath, int* count) { ncoutfile->varmiss[v] = 0; } } - + /* If the output netCDF file was just created then define its structure */ if ((!appendnc) && (!mem_dry_run)) { if (verbose) printf (" Creating output \"%s\"\n", outncname); - + /* Determine the format of the input netCDF file */ if (nc_inq_format (ncinfile->ncfid, &ncinformat) == (-1)) { @@ -1263,7 +1263,7 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + /* Determine the format of the output netCDF file */ if (nc_inq_format (ncoutfile->ncfid, &ncoutformat) == (-1)) { @@ -1272,11 +1272,11 @@ char** find_partitioned_files(const char* filepath, int* count) { free (ncinfile); return (1); } - + if (verbose) printf (" ncinformat=%d, ncoutformat=%d\n", ncinformat, ncoutformat); - + /* If the format option (-64 or -n4) for the output netCDF file * is not specified then recreate the output netCDF file based * upon the format of the input netCDF file. */ @@ -1320,7 +1320,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } ncsetfill (ncoutfile->ncfid, NC_NOFILL); } - + /* Define the dimensions */ for (d = 0; d < ncinfile->ndims; d++) { @@ -1330,7 +1330,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncdimdef (ncoutfile->ncfid, ncinfile->dimname[d], ncinfile->dimfullsize[d]); } - + /* Define the variables and copy their attributes */ for (v = 0; v < ncinfile->nvars; v++) { @@ -1367,7 +1367,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Copy the global attributes */ for (n = 0; n < ncinfile->ngatts; n++) { @@ -1390,7 +1390,7 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + if (deflate == 1 && deflation > 0) { for (v = 0; v < ncinfile->nvars; v++) @@ -1405,11 +1405,11 @@ char** find_partitioned_files(const char* filepath, int* count) { } } } - + /* Definitions done */ nc__enddef (ncoutfile->ncfid, headerpad, 4, 0, 4); } - + /* Copy all data values of the dimensions and variables to memory */ /* For non-decomposed variables, process_vars will write them to the */ /* output file. Decomposed variables for N records from this file will */ @@ -1436,13 +1436,13 @@ char** find_partitioned_files(const char* filepath, int* count) { r < min ( ((block + 1) * (*bf)), *nrecs)); // r is a minimum of the next block start point and nrecs - + /* Done */ ncclose (ncinfile->ncfid); free (ncinfile); return (ncinfileerror); } - + /* Decomposed variables from an input file and record will be written to memory */ /* non-decomposed variables will be written to the output file */ @@ -1468,17 +1468,17 @@ char** find_partitioned_files(const char* filepath, int* count) { int recdimsize; /* Using a local value to correct issue when netcdf file does not have a record dimension */ long long varbufsize; - + if (ncinfile->recdim < 0) recdimsize = 1; else recdimsize = ncinfile->dimsize[ncinfile->recdim]; - + /* Check the number of records */ if (*nrecs == 1) { *nrecs = recdimsize; - + /* adjust bf */ if ((*bf) >= 1) { @@ -1504,7 +1504,7 @@ char** find_partitioned_files(const char* filepath, int* count) { *nblocks = (int)((*nrecs) / (*bf)) + 1; else *nblocks = (int)((*nrecs) / (*bf)); - + if (verbose) fprintf (stderr, "blocking factor=%d, num. blocks=%d, num. records=%d\n", *bf, @@ -1517,7 +1517,7 @@ char** find_partitioned_files(const char* filepath, int* count) { "Error: different number of records than the first input file!\n"); return (1); } - + /* Allocate memory for the decomposed variables, if none has been allocated yet We use an optimized algorithm to malloc and set up a double dimension array using a single malloc call. We do the cross-linking after the @@ -1563,13 +1563,13 @@ char** find_partitioned_files(const char* filepath, int* count) { } */ } /* end of memory allocation, done once per block */ - + /* Loop over all the variables */ for (v = 0; v < ncinfile->nvars; v++) { if (verbose > 1) printf (" variable = %s\n", ncinfile->varname[v]); - + /* Get read/write dimension sizes for the variable */ recsize = 1; recfullsize = 1; @@ -1597,7 +1597,7 @@ char** find_partitioned_files(const char* filepath, int* count) { printf (" dim %d: instart=%ld outstart=%ld count=%ld\n", d, instart[d], outstart[d], count[d]); } - + /* Prevent unnecessary reads/writes */ if (r > 0) { @@ -1618,11 +1618,11 @@ char** find_partitioned_files(const char* filepath, int* count) { { /* Prevent unnecessary reads/writes of non-decomposed variables if (ncinfile->vardecomp[v]!=1 && appendnc) continue; */ - + /* Non-record variables */ if (varrecdim == (-1)) continue; - + /* Non-decomposed record variables */ if (ncinfile->vardecomp[v] != 1 && f > 0) continue; @@ -1633,7 +1633,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (ncinfile->vardecomp[v] != 1 && appendnc) continue; } - + /* Allocate a buffer for the variable's record */ if ((values = malloc (nctypelen (ncinfile->datatype[v]) * recsize)) == NULL) @@ -1645,7 +1645,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->varname[v]); return (1); } - + /* Read the variable */ if (varrecdim != (-1)) instart[varrecdim] = outstart[varrecdim] = r; @@ -1655,7 +1655,7 @@ char** find_partitioned_files(const char* filepath, int* count) { ncinfile->varname[v]); return (1); } - + /* Write the buffered variable immediately if it's not decomposed */ if ((ncinfile->vardecomp[v] != 1) && (!mem_dry_run)) { @@ -1740,7 +1740,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (verbose > 1) printf (" writing %lld bytes to memory\n", nctypelen (ncinfile->datatype[v]) * recsize); - + imax = ncinfile ->dimsize[ncinfile->vardim[v][ncinfile->varndims[v] - 1]]; if (ncinfile->varndims[v] > 1) @@ -1776,7 +1776,7 @@ char** find_partitioned_files(const char* filepath, int* count) { if (verbose > 1) printf (" imap=%d jmax=%d kmax=%d lmax=%d\n", imax, jmax, kmax, lmax); - + imaxfull = ncinfile->dimfullsize[ncinfile->vardim[v][ncinfile->varndims[v] - 1]]; @@ -1808,7 +1808,7 @@ char** find_partitioned_files(const char* filepath, int* count) { imaxfull, jmaxfull, kmaxfull, lmaxfull); imaxjmaxfull = imaxfull * jmaxfull; imaxjmaxkmaxfull = imaxfull * jmaxfull * kmaxfull; - + ioffset = outstart[ncinfile->varndims[v] - 0 - 1]; if (ncinfile->varndims[v] > 1) joffset = outstart[ncinfile->varndims[v] - 1 - 1]; @@ -1939,7 +1939,7 @@ char** find_partitioned_files(const char* filepath, int* count) { break; } } - + /* Deallocate the decomposed variable's buffer */ if (values != NULL) free (values); @@ -1947,7 +1947,7 @@ char** find_partitioned_files(const char* filepath, int* count) { first = 0; return (0); } - + /* Write all the buffered decomposed variables to the output file */ int flush_decomp (struct fileinfo *ncoutfile, int nfiles, int r, int bf, @@ -1957,12 +1957,12 @@ char** find_partitioned_files(const char* filepath, int* count) { long outstart[MAX_NC_DIMS]; /* Data array sizes */ long count[MAX_NC_DIMS]; /* " */ int varrecdim; /* Position of a variable's record dimension */ - + if (verbose > 1) { printf (" nvars=%d\n", ncoutfile->nvars); } - + /* Write out all the decomposed variables */ for (v = 0; v < ncoutfile->nvars; v++) { @@ -2002,4 +2002,4 @@ char** find_partitioned_files(const char* filepath, int* count) { } } return (0); - } \ No newline at end of file + } diff --git a/fms2_io/netcdf_io.F90 b/fms2_io/netcdf_io.F90 index e3c120b8e8..bdfba1f843 100644 --- a/fms2_io/netcdf_io.F90 +++ b/fms2_io/netcdf_io.F90 @@ -247,7 +247,7 @@ module netcdf_io_mod public :: read_restart_bc public :: flush_file public :: partitioned_global_files -public :: partitioned_section_files +public :: partitioned_section_files public :: append_to_filepath_list !> @ingroup netcdf_io_mod @@ -703,7 +703,7 @@ subroutine append_to_filepath_list(filepath, filepath_list) else current => filepath_list do - ! If file is already in the list, return. This happens when new_file_freq (e.g., daily) + ! If file is already in the list, return. This happens when new_file_freq (e.g., daily) ! is more frequent than the date suffix (e.g., %4yr-%2mo), leading to overriding the file. if (string_compare(current%path, trim(filepath))) return ! If we reach the end of the list, exit the loop to add the new file. diff --git a/mpp/include/group_update_pack.inc b/mpp/include/group_update_pack.inc index 142106aece..a6ee72d7b8 100644 --- a/mpp/include/group_update_pack.inc +++ b/mpp/include/group_update_pack.inc @@ -18,64 +18,103 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do n = 1, npack buffer_pos = group%pack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%pack_is(n); ie = group%pack_ie(n) - js = group%pack_js(n); je = group%pack_je(n) + is = group%pack_is(n); ie = group%pack_ie(n); ni = ie-is+1 + js = group%pack_js(n); je = group%pack_je(n); nj = je-js+1 rotation = group%pack_rotation(n) if( group%pack_type(n) == FIELD_S ) then select case( rotation ) case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do enddo + pos = pos + ksize*nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: field(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -83,77 +122,127 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + ! pos = pos + 1 + ! buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) @@ -162,86 +251,136 @@ if( group%k_loop_inside ) then case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (k-1)*nj*ni + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if (use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,is,ie,je,js,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(is:ie,js:je,1:ksize)) & + !$omp map(from: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp if(use_device_ptr) +#endif do k = 1, ksize do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (k-1)*nj*ni + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do end do + pos = pos + ksize*nj*ni end do end if end select ! select case( rotation(n) ) endif enddo else -!$OMP parallel do default(none) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(npack,group,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is,ie,js,je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = 1, npack*ksize n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 @@ -255,42 +394,74 @@ else case(ZERO) do l=1, group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni enddo case( MINUS_NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) do l=1,group%nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: field(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = field(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = field(i,j,k) end do end do + pos = pos + nj*ni end do end select else if( group%pack_type(n) == FIELD_X ) then @@ -298,65 +469,113 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if case( NINETY ) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) @@ -365,65 +584,113 @@ else case(ZERO) do l=1, nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (j-js)*ni + (i-is) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do case( MINUS_NINETY ) do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if(use_device_ptr) +#endif do i = is, ie do j = je, js, -1 - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (i-is)*nj + (je-j) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do case( NINETY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1, nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = fieldx(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: fieldx(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do i = ie, is, -1 do j = js, je - pos = pos + 1 - buffer(pos) = -fieldx(i,j,k) + idx = pos + (ie-i)*nj + (j-js) + 1 + buffer(idx) = -fieldx(i,j,k) end do end do + pos = pos + nj*ni end do end if case( ONE_HUNDRED_EIGHTY ) if( BTEST(group%flags_v,SCALAR_BIT) ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = fieldy(i,j,k) end do end do + pos = pos + nj*ni end do else do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: fieldy(k, is:ie, js:je)) & + !$omp map(from: buffer(pos+1:pos+nj*ni)) & + !$omp if (use_device_ptr) +#endif do j = je, js, -1 do i = ie, is, -1 - pos = pos + 1 - buffer(pos) = -fieldy(i,j,k) + idx = pos + (je-j)*ni + (ie-i) + 1 + buffer(idx) = -fieldy(i,j,k) end do end do + pos = pos + nj*ni end do end if end select ! select case( rotation(n) ) diff --git a/mpp/include/group_update_unpack.inc b/mpp/include/group_update_unpack.inc index 7f60ed93df..25a31ee8fc 100644 --- a/mpp/include/group_update_unpack.inc +++ b/mpp/include/group_update_unpack.inc @@ -18,92 +18,140 @@ !*********************************************************************** if( group%k_loop_inside ) then -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +! nvfortran + cray pointers imposes some restrictions on the loops below: +! * the compiler cannot privatise OpenMP cray pointers in offloaded loops. Hence, inner loops +! must be ported rather than the whole outer loop. +! * the more verbose form of openmp offload loops must be used. Would prefer "target teams loop". +! * default(shared) must be used otherwise loops hang or segfault. Would prefer "default(none)". +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k ) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy, n,k,ni,nj,idx) +#endif do n = nunpack, 1, -1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) if(use_device_ptr) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_field,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: field(is:ie,js:je,1:ksize)) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldx,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: fieldx(is:ie,js:je,1:ksize)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(3) default(shared) & + !$omp private(i,j,k,idx) shared(ksize,js,je,is,ie,pos,nj,ni,ptr_fieldy,ptr) & + !$omp map(to: buffer(pos+1:pos+ksize*nj*ni)) & + !$omp map(from: fieldy(is:ie,js:je,1:ksize)) if(use_device_ptr) +#endif do k = 1, ksize do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (k-1)*nj*ni + (j-js)*ni + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do end do + pos = pos + ksize*nj*ni end do endif enddo else -!$OMP parallel do default(none) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & +#ifndef __NVCOMPILER_OPENMP_GPU +!$OMP parallel do default(shared) shared(nunpack,group,nscalar,ptr,nvector,ksize,buffer_start_pos) & !$OMP private(buffer_pos,pos,m,is, ie, js, je,rotation, & -!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k) +!$OMP ptr_field, ptr_fieldx, ptr_fieldy,n,k,ni,nj,idx) +#endif do nk = nunpack*ksize, 1, -1 n = (nk-1)/ksize + 1 k = mod((nk-1), ksize) + 1 buffer_pos = group%unpack_buffer_pos(n) + buffer_start_pos pos = buffer_pos + (k-1)*group%unpack_size(n) - is = group%unpack_is(n); ie = group%unpack_ie(n) - js = group%unpack_js(n); je = group%unpack_je(n) + is = group%unpack_is(n); ie = group%unpack_ie(n); ni = ie-is+1 + js = group%unpack_js(n); je = group%unpack_je(n); nj = je-js+1 if( group%unpack_type(n) == FIELD_S ) then do l=1,nscalar ! loop over number of fields ptr_field = group%addrs_s(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_field,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: field(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - field(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + field(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_X ) then do l=1,nvector ! loop over number of fields ptr_fieldx = group%addrs_x(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldx,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldx(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldx(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldx(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do else if( group%unpack_type(n) == FIELD_Y ) then do l=1,nvector ! loop over number of fields ptr_fieldy = group%addrs_y(l) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target teams distribute parallel do collapse(2) default(shared) & + !$omp private(i,j,idx) shared(k,js,je,is,ie,pos,ni,ptr_fieldy,ptr) & + !$omp map(to: buffer(pos+1:pos+nj*ni)) map(from: fieldy(is:ie,js:je,k)) if(use_device_ptr) +#endif do j = js, je do i = is, ie - pos = pos + 1 - fieldy(i,j,k) = buffer(pos) + idx = pos + (j-js)*ni + (i-is) + 1 + fieldy(i,j,k) = buffer(idx) end do end do + pos = pos + ni*nj end do endif enddo diff --git a/mpp/include/mpp_comm_mpi.inc b/mpp/include/mpp_comm_mpi.inc index 928f9fcb92..142473497e 100644 --- a/mpp/include/mpp_comm_mpi.inc +++ b/mpp/include/mpp_comm_mpi.inc @@ -31,6 +31,7 @@ !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! !> @brief Initialize the @ref mpp_mod module. Must be called before any usage. subroutine mpp_init( flags, localcomm, test_level, alt_input_nml_path ) + !$ use omp_lib integer, optional, intent(in) :: flags !< Flags for debug output, can be MPP_VERBOSE or MPP_DEBUG integer, optional, intent(in) :: localcomm !< Id of MPI communicator used to initialize integer, optional, intent(in) :: test_level !< Used to exit initialization at certain stages @@ -54,6 +55,14 @@ call MPI_COMM_RANK( mpp_comm_private, pe, error ) call MPI_COMM_SIZE( mpp_comm_private, npes, error ) + ! set default device to enable multi GPU parallelism + ! calls to both OpenACC and OpenMP runtimes are needed + ! because we use both do-concurrent and openmp + ! if you remove either, the code will run multiple + ! ranks on a _single_ GPU. Be careful out there! + !$ call omp_set_default_device(pe) + !$acc set device_num(pe) + module_is_initialized = .TRUE. if (present(test_level)) then t_level = test_level diff --git a/mpp/include/mpp_group_update.fh b/mpp/include/mpp_group_update.fh index 0f04c06c3b..5bbde5ce6a 100644 --- a/mpp/include/mpp_group_update.fh +++ b/mpp/include/mpp_group_update.fh @@ -419,20 +419,22 @@ subroutine MPP_CREATE_GROUP_UPDATE_4D_V_( group, fieldx, fieldy, domain, flags, end subroutine MPP_CREATE_GROUP_UPDATE_4D_V_ -subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) +subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type, omp_offload) type(mpp_group_update_type), intent(inout) :: group type(domain2D), intent(inout) :: domain MPP_TYPE_, intent(in) :: d_type + logical, optional, intent(in) :: omp_offload integer :: nscalar, nvector, nlist logical :: recv_y(8) integer :: nsend, nrecv, flags_v integer :: msgsize - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk integer :: shift, gridtype, midpoint integer :: npack, nunpack, rotation, isd + logical :: use_device_ptr MPP_TYPE_ :: buffer(mpp_domains_stack_size) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) @@ -448,6 +450,9 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) nlist = size(domain%list(:)) gridtype = group%gridtype + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + !--- ksize_s must equal ksize_v if(nvector > 0 .AND. nscalar > 0) then if(group%ksize_s .NE. group%ksize_v) then @@ -476,13 +481,16 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) !---pre-post receive. call mpp_clock_begin(group_recv_clock) +#ifdef __NVCOMPILER_OPENMP_GPU + !$omp target enter data map(alloc: buffer) if(use_device_ptr) +#endif do m = 1, nrecv msgsize = group%recv_size(m) from_pe = group%from_pe(m) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_recv(m) call mpp_recv( buffer(buffer_pos+1), glen=msgsize, from_pe=from_pe, block=.false., & - tag=COMM_TAG_1) + tag=COMM_TAG_1, omp_offload=omp_offload) end if end do @@ -495,7 +503,19 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(group_pack_clock) !pack the data buffer_start_pos = 0 + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU +#else #include +#endif + endif call mpp_clock_end(group_pack_clock) call mpp_clock_begin(group_send_clock) @@ -504,7 +524,7 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) if( msgsize .GT. 0 )then buffer_pos = group%buffer_pos_send(n) to_pe = group%to_pe(n) - call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1) + call mpp_send( buffer(buffer_pos+1), plen=msgsize, to_pe=to_pe, tag=COMM_TAG_1, omp_offload=omp_offload) endif enddo call mpp_clock_end(group_send_clock) @@ -518,7 +538,20 @@ subroutine MPP_DO_GROUP_UPDATE_(group, domain, d_type) !---unpack the buffer nunpack = group%nunpack call mpp_clock_begin(group_unpk_clock) + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU #include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include +#endif + endif call mpp_clock_end(group_unpk_clock) ! ---northern boundary fold @@ -644,10 +677,11 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) integer :: nscalar, nvector integer :: nsend, nrecv, flags_v integer :: msgsize, npack, rotation - integer :: from_pe, to_pe, buffer_pos, pos + integer :: from_pe, to_pe, buffer_pos, pos, idx integer :: ksize, is, ie, js, je - integer :: n, l, m, i, j, k, buffer_start_pos, nk + integer :: n, l, m, i, j, k, buffer_start_pos, ni, nj, nk logical :: reuse_buf_pos + logical, parameter :: use_device_ptr = .false. ! placeholder character(len=8) :: text MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) @@ -726,7 +760,19 @@ subroutine MPP_START_GROUP_UPDATE_(group, domain, d_type, reuse_buffer) call mpp_clock_begin(nonblock_group_pack_clock) npack = group%npack buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded packing if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then +#include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU #include +#define __NVCOMPILER_OPENMP_GPU +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_pack_clock) call mpp_clock_begin(nonblock_group_send_clock) @@ -749,11 +795,12 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) MPP_TYPE_, intent(in) :: d_type integer :: nsend, nrecv, nscalar, nvector - integer :: k, buffer_pos, pos, m, n, l + integer :: k, buffer_pos, pos, m, n, l, idx integer :: is, ie, js, je, ksize, i, j integer :: shift, gridtype, midpoint, flags_v - integer :: nunpack, rotation, buffer_start_pos, nk, isd + integer :: nunpack, rotation, buffer_start_pos, ni, nj, nk, isd logical :: recv_y(8) + logical, parameter :: use_device_ptr = .false. ! placeholder MPP_TYPE_ :: buffer(size(mpp_domains_stack_nonblock(:))) MPP_TYPE_ :: field (group%is_s:group%ie_s,group%js_s:group%je_s, group%ksize_s) MPP_TYPE_ :: fieldx(group%is_x:group%ie_x,group%js_x:group%je_x, group%ksize_v) @@ -794,7 +841,20 @@ subroutine MPP_COMPLETE_GROUP_UPDATE_(group, domain, d_type) call mpp_clock_begin(nonblock_group_unpk_clock) buffer_start_pos = group%buffer_start_pos + ! below switch runs OpenMP offloaded unpacking if ompoffload is .true. and compiled with + ! OpenMP offload support. Otherwise, run OpenMP CPU by undefining GPU macro if defined + if (use_device_ptr) then #include + else +#ifdef __NVCOMPILER_OPENMP_GPU +#undef __NVCOMPILER_OPENMP_GPU +#include +#define __NVCOMPILER_OPENMP_GPU + !$omp target exit data map(release: buffer) if(use_device_ptr) +#else +#include +#endif + endif call mpp_clock_end(nonblock_group_unpk_clock) ! ---northern boundary fold diff --git a/mpp/include/mpp_transmit.inc b/mpp/include/mpp_transmit.inc index 24d0cc437f..71b0d97f0c 100644 --- a/mpp/include/mpp_transmit.inc +++ b/mpp/include/mpp_transmit.inc @@ -171,7 +171,7 @@ call mpp_transmit( put_data, put_len, to_pe, dummy, 1, NULL_PE, tag=tag, send_request=request ) end subroutine MPP_SEND_ - subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request ) + subroutine MPP_RECV_SCALAR_( get_data, from_pe, glen, block, tag, request, omp_offload ) !a mpp_transmit with null arguments on the put side integer, intent(in) :: from_pe MPP_TYPE_, intent(out) :: get_data @@ -180,6 +180,7 @@ integer, intent(out), optional :: request integer, optional, intent(in) :: glen + logical, optional, intent(in) :: omp_offload integer :: get_len MPP_TYPE_ :: get_data1D(1) MPP_TYPE_ :: dummy(1) @@ -189,17 +190,19 @@ ptr = LOC(get_data) get_len=1; if(PRESENT(glen))get_len=glen - call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data1D, get_len, from_pe, & + block, tag, recv_request=request, omp_offload=omp_offload ) end subroutine MPP_RECV_SCALAR_ - subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request) + subroutine MPP_SEND_SCALAR_( put_data, to_pe, plen, tag, request, omp_offload) !a mpp_transmit with null arguments on the get side integer, intent(in) :: to_pe MPP_TYPE_, intent(in) :: put_data integer, optional, intent(in) :: plen integer, intent(in), optional :: tag integer, intent(out), optional :: request + logical, optional, intent(in) :: omp_offload integer :: put_len MPP_TYPE_ :: put_data1D(1) MPP_TYPE_ :: dummy(1) @@ -207,7 +210,8 @@ pointer( ptr, put_data1D ) ptr = LOC(put_data) put_len=1; if(PRESENT(plen))put_len=plen - call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, tag = tag, send_request=request ) + call mpp_transmit( put_data1D, put_len, to_pe, dummy, 1, NULL_PE, & + tag=tag, send_request=request, omp_offload=omp_offload ) end subroutine MPP_SEND_SCALAR_ @@ -220,7 +224,8 @@ integer, intent(out), optional :: request MPP_TYPE_ :: dummy(1,1) - call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, block, tag, recv_request=request ) + call mpp_transmit( dummy, 1, NULL_PE, get_data, get_len, from_pe, & + block, tag, recv_request=request ) end subroutine MPP_RECV_2D_ subroutine MPP_SEND_2D_( put_data, put_len, to_pe, tag, request ) diff --git a/mpp/include/mpp_transmit_mpi.fh b/mpp/include/mpp_transmit_mpi.fh index 023c2d5124..4ce6af6148 100644 --- a/mpp/include/mpp_transmit_mpi.fh +++ b/mpp/include/mpp_transmit_mpi.fh @@ -37,7 +37,7 @@ !!(avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -45,14 +45,19 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload logical :: block_comm integer :: i integer :: comm_tag integer :: rsize + logical :: use_device_ptr if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + use_device_ptr = .false. + if (present(omp_offload)) use_device_ptr = omp_offload + block_comm = .true. if(PRESENT(block)) block_comm = block @@ -82,8 +87,15 @@ if( cur_send_request > max_request ) & call mpp_error(FATAL, & & "MPP_TRANSMIT: cur_send_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & - request_send(cur_send_request), error) + if (use_device_ptr) then + !$omp target data use_device_addr(put_data) + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + !$omp end target data + else + call MPI_ISEND( put_data, put_len, MPI_TYPE_, to_pe, comm_tag, mpp_comm_private, & + request_send(cur_send_request), error) + endif endif if (debug .and. (current_clock.NE.0)) call increment_current_clock(EVENT_SEND, put_len*MPP_TYPE_BYTELEN_) else if (to_pe.EQ.ALL_PES) then !this is a broadcast from from_pe @@ -130,8 +142,15 @@ if( cur_recv_request > max_request ) & call mpp_error(FATAL, & "MPP_TRANSMIT: cur_recv_request is greater than max_request, increase mpp_nml request_multiply") - call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & - request_recv(cur_recv_request), error ) + if (use_device_ptr) then + !$omp target data use_device_addr(get_data) + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + !$omp end target data + else + call MPI_IRECV( get_data, get_len, MPI_TYPE_, from_pe, comm_tag, mpp_comm_private, & + request_recv(cur_recv_request), error ) + endif size_recv(cur_recv_request) = get_len type_recv(cur_recv_request) = MPI_TYPE_ endif diff --git a/mpp/include/mpp_transmit_nocomm.fh b/mpp/include/mpp_transmit_nocomm.fh index ca132a4dc8..ee08d53a53 100644 --- a/mpp/include/mpp_transmit_nocomm.fh +++ b/mpp/include/mpp_transmit_nocomm.fh @@ -36,7 +36,7 @@ !!words from an array of any rank to be passed (avoiding f90 rank conformance check) !!caller is responsible for completion checks (mpp_sync_self) before and after subroutine MPP_TRANSMIT_( put_data, put_len, to_pe, get_data, get_len, from_pe, block, tag, recv_request, & - & send_request ) + & send_request, omp_offload ) integer, intent(in) :: put_len, to_pe, get_len, from_pe MPP_TYPE_, intent(in) :: put_data(*) @@ -44,6 +44,8 @@ logical, intent(in), optional :: block integer, intent(in), optional :: tag integer, intent(out), optional :: recv_request, send_request + logical, intent(in), optional :: omp_offload + ! NOTE: omp_offload is unused in this function integer :: i, outunit MPP_TYPE_, allocatable, save :: local_data(:) !local copy used by non-parallel code (no SHMEM or MPI) @@ -55,6 +57,7 @@ if( .NOT.module_is_initialized )call mpp_error( FATAL, 'MPP_TRANSMIT: You must first call mpp_init.' ) if( to_pe.EQ.NULL_PE .AND. from_pe.EQ.NULL_PE )return + outunit = stdout() if( debug )then call SYSTEM_CLOCK(tick) diff --git a/test_fms/fms2_io/test_domain_io.F90 b/test_fms/fms2_io/test_domain_io.F90 index 07a3e2845a..8b170f3819 100644 --- a/test_fms/fms2_io/test_domain_io.F90 +++ b/test_fms/fms2_io/test_domain_io.F90 @@ -46,7 +46,7 @@ program test_domain_read integer :: xhalo = 3 !< Number of halo points in X integer :: yhalo = 2 !< Number of halo points in Y integer :: nz = 2 !< Number of points in the z dimension - character(len=20) :: filename="test.nc" !< Name of the file + character(len=25) :: filename="test.nc" !< Name of the file logical :: use_edges=.false. !< Use North and East domain positions integer :: ndim4 !< Number of points in dim4 diff --git a/test_fms/mpp/test_mpp_gatscat.F90 b/test_fms/mpp/test_mpp_gatscat.F90 index 47ff6cf81c..3ebacf61d9 100644 --- a/test_fms/mpp/test_mpp_gatscat.F90 +++ b/test_fms/mpp/test_mpp_gatscat.F90 @@ -779,122 +779,4 @@ subroutine test_gatherV(npes,pe,root,out_unit) deallocate(sdata,rdata,ref) end subroutine test_gatherV - !> @brief Test the 2D vector mpp_gather routine. - !> @todo This is a legacy routine which does not work in all conditions. For the gcc version, - !> the use of cray pointers is suspect to causing a crash at the call to mpp_gather. -subroutine test_gather2DV(npes,pe,root,out_unit) - implicit none - integer, intent(in) :: npes,pe,root,out_unit - - integer :: pelist(npes),rsize(npes) - integer :: pelist2(npes),rsize2(npes) - integer :: i,j,k,l,nz,ssize,nelems - real,allocatable,dimension(:,:) :: data, cdata, sbuff,rbuff - real,allocatable :: ref(:,:) - integer, parameter :: KSIZE=10 - - real :: sbuff1D(size(sbuff)) - real :: rbuff1D(size(rbuff)) - pointer(sptr,sbuff1D); pointer(rptr,rbuff1D) - - - if(npes < 3)then - call mpp_error(FATAL, "Test_gather2DV: minimum of 3 ranks required. Not testing gather; too few ranks.") - elseif(npes > 9999)then - call mpp_error(FATAL, "Test_gather2DV: maximum of 9999 ranks supported. Not testing gather2DV; too many ranks.") - return - endif - write(out_unit,*) - - ssize = pe+1 - allocate(data(ssize,KSIZE)) - do k=1,KSIZE; do i=1,ssize - data(i,k) = 10000.0*k + pe + 0.0001*i - enddo; enddo - do i=1,npes - pelist(i) = i-1 - rsize(i) = i - enddo - - nz = KSIZE - nelems = sum(rsize(:)) - - allocate(rbuff(nz,nelems)); rbuff = -1.0 - allocate(ref(nelems,nz),cdata(nelems,nz)) - ref = 0.0; cdata = 0.0 - if(pe == root)then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize(j) - ref(l,k) = 10000.0*k + pelist(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - allocate(sbuff(nz,ssize)) - ! this matrix inversion makes for easy gather to the IO root - ! and a clear, concise unpack - do j=1,ssize - do i=1,nz - sbuff(i,j) = data(j,i) - enddo; enddo - - ! Note that the gatherV implied here is asymmetric; only root needs to know the vector of recv size - sptr = LOC(sbuff); rptr = LOC(rbuff) - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize(:)) - - if(pe == root)then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV global pelist failed") - endif - enddo;enddo - endif - - call mpp_sync() - write(out_unit,*) "Test gather2DV with global pelist successful" - - do i=1,npes - pelist2(i) = pelist(npes-i+1) - rsize2(i) = rsize(npes-i+1) - enddo - - rbuff = -1.0 - ref = 0.0; cdata = 0.0 - if(pe == pelist2(1))then - do k=1,KSIZE - l=1 - do j=1,npes - do i=1,rsize2(j) - ref(l,k) = 10000.0*k + pelist2(j) + 0.0001*i - l = l+1 - enddo; enddo;enddo - endif - - call mpp_gather(sbuff1D,size(sbuff),rbuff1D,nz*rsize2(:),pelist2) - - if(pe == pelist2(1))then - do j=1,nz - do i=1,nelems - cdata(i,j) = rbuff(j,i) - enddo; enddo - do j=1,nz - do i=1,nelems - if(cdata(i,j) /= ref(i,j))then - write(6,*) "Gathered data ",cdata(i,j), " NE reference ",ref(i,j), "at i,j=",i,j - call mpp_error(FATAL, "Test gather2DV with reversed pelist failed") - endif - enddo;enddo - endif - call mpp_sync() - write(out_unit,*) "Test gather2DV with reversed pelist successful" - deallocate(data,sbuff,rbuff,cdata,ref) - end subroutine test_gather2DV - end program test_mpp_gatscat