diff --git a/features/config/TEMPLATE_nvSHMEM.xml b/features/config/TEMPLATE_nvSHMEM.xml
new file mode 100644
index 000000000..2f7ffa371
--- /dev/null
+++ b/features/config/TEMPLATE_nvSHMEM.xml
@@ -0,0 +1,15 @@
+
+
+
+ test
+
+
+
+
+
+
+
+
+
+
+
diff --git a/features/feature_case/nvSHMEM/nvshmem.cu b/features/feature_case/nvSHMEM/nvshmem.cu
new file mode 100644
index 000000000..35485fc24
--- /dev/null
+++ b/features/feature_case/nvSHMEM/nvshmem.cu
@@ -0,0 +1,146 @@
+// ===--------------- nvshmem.cu --------------- *- CUDA -* ---------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+#include
+#include
+#include
+#include
+#include
+
+#define N 32
+
+__global__ void set_data(int *shared_data, int mype) {
+ size_t i = threadIdx.x;
+
+ shared_data[i] = static_cast(mype * 2);
+}
+
+__global__ void kernel_putmem_signal_nbi(int *shared_data, uint64_t *signal, int val, int target_pe) {
+ nvshmem_putmem_signal_nbi(shared_data, shared_data, N * sizeof(int), signal, val, NVSHMEM_SIGNAL_SET, target_pe);
+}
+
+__global__ void kernel_signal_wait_until(uint64_t *signal, uint64_t val) {
+ nvshmem_signal_wait_until(signal, NVSHMEM_CMP_EQ, val);
+}
+
+int main(int argc, char **argv) {
+ MPI_Init(&argc, &argv);
+ int rank, nranks;
+ MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+ MPI_Comm_size(MPI_COMM_WORLD, &nranks);
+
+ nvshmemx_init_attr_t attr;
+ MPI_Comm mpi_comm = MPI_COMM_WORLD;
+
+ attr.mpi_comm = &mpi_comm;
+ nvshmemx_init_attr(NVSHMEMX_INIT_WITH_MPI_COMM, &attr);
+
+ int mype = nvshmem_my_pe();
+ int npes = nvshmem_n_pes();
+
+ std::cout << "ISHMEM initialized with " << npes << " PEs." << std::endl;
+ std::cout << "My PE: " << mype << std::endl;
+
+ int *shared_data = (int *)nvshmem_malloc(N * sizeof(int));
+ assert(shared_data != nullptr && "nvshmem_malloc failed");
+
+ uint64_t *signal_addr = (uint64_t *)nvshmem_malloc(sizeof(uint64_t));
+ assert(signal_addr != nullptr && "nvshmem_malloc for signal failed");
+
+ set_data<<>>(shared_data, mype);
+
+ const int target_pe = 1;
+
+ // copy data from PE 0 to PE 1
+ if (mype == 0) {
+ nvshmem_putmem_nbi(shared_data, shared_data, N * sizeof(int), target_pe);
+ }
+
+ int recv_shared_data[N] = {1};
+
+ // Retrieve the data on PE 1
+ if (mype == 1) {
+ cudaMemcpy((void *)recv_shared_data, (void *)shared_data, N * sizeof(int), cudaMemcpyDeviceToHost);
+
+ for (int i = 0; i < N; i++) {
+ if (recv_shared_data[i] != 0) {
+ std::cerr << "[" << mype << "] Data verification 1 failed at index " << i << ": " << recv_shared_data[i] << "\n";
+ std::exit(1);
+ }
+ }
+ }
+
+ // Get the pointer to data on PE 1
+ int *remote_data = (int *)nvshmem_ptr(shared_data, 1);
+ cudaMemcpy((void *)recv_shared_data, (void *)remote_data, N * sizeof(int), cudaMemcpyDeviceToHost);
+
+ if (mype == 0) {
+ for (int i = 0; i < N; i++) {
+ if (recv_shared_data[i] != 0) {
+ std::cerr << "[" << mype << "] Data verification 2 failed at index " << i << ": " << recv_shared_data[i] << "\n";
+ std::exit(1);
+ }
+ }
+ }
+ std::cout << "putmem_nbi & shmem_ptr: Data verification successful" << std::endl;
+
+ // Reset data on all PEs
+ set_data<<>>(shared_data, mype);
+
+ // Allocate & set signal memory
+ uint64_t *signal = (uint64_t *)nvshmem_malloc(sizeof(uint64_t));
+ assert(signal != nullptr && "nvshmem_malloc failed");
+
+ uint64_t h_signal = 0;
+ cudaMemcpy((void *)signal, (void *)&h_signal, sizeof(uint64_t), cudaMemcpyHostToDevice);
+
+ // Copy data from PE 0 to PE 1 and signal completion
+ // nvshmem_barrier_all();
+ kernel_putmem_signal_nbi<<<1, 1>>>(shared_data, signal, 1, 1);
+
+ // Check whether signal value & data updated in PE 1
+ if (mype == 1) {
+ kernel_signal_wait_until<<<1, 1>>>(signal, 1);
+
+ cudaMemcpy((void *)recv_shared_data, (void *)shared_data, N * sizeof(int), cudaMemcpyDeviceToHost);
+
+ for (int i = 0; i < N; i++) {
+ if (recv_shared_data[i] != 0) {
+ std::cerr << "[" << mype << "] Data verification 3 failed at index " << i << ": " << recv_shared_data[i] << "\n";
+ std::exit(1);
+ }
+ }
+ }
+ std::cout << "putmem_signal_nbi & signal_wait_until: Data verification successful" << std::endl;
+
+ // Set and update signal value in PE 1
+ nvshmemx_signal_op(signal, 1, NVSHMEM_SIGNAL_SET, 1);
+ nvshmemx_signal_op(signal, 1, NVSHMEM_SIGNAL_ADD, 1);
+
+ // Check whether signal value updated in PE 1
+ if (mype == 1) {
+ kernel_signal_wait_until<<<1, 1>>>(signal, 2);
+
+ cudaMemcpy((void *)recv_shared_data, (void *)signal, sizeof(uint64_t), cudaMemcpyDeviceToHost);
+
+ if (recv_shared_data[0] != 2) {
+ std::cerr << "[" << mype << "] Signal verification failed: " << recv_shared_data[0] << "\n";
+ std::exit(1);
+ }
+ }
+ std::cout << "signal_set/add: Data verification successful" << std::endl;
+
+ nvshmem_free(shared_data);
+ nvshmem_free(signal);
+
+ nvshmem_finalize();
+ MPI_Finalize();
+
+ return 0;
+}
diff --git a/features/features.xml b/features/features.xml
index 1d65a94eb..581250511 100644
--- a/features/features.xml
+++ b/features/features.xml
@@ -392,5 +392,6 @@
+
diff --git a/features/test_feature.py b/features/test_feature.py
index ba22b61c7..927438ac6 100644
--- a/features/test_feature.py
+++ b/features/test_feature.py
@@ -249,6 +249,15 @@ def build_test():
link_opts.append(' dnnl.lib')
ret = False
+ if test_config.current_test == 'nvshmem':
+ ISHMEMROOT = os.environ['ISHMEMROOT']
+ ISHMEMVER = os.environ['ISHMEMVER']
+
+ if (ISHMEMROOT and ISHMEMVER):
+ link_opts.append(os.path.join(ISHMEMROOT, ISHMEMVER, 'lib', 'libishmem.a'))
+
+ link_opts.append('-lze_loader -lmpi')
+
if (test_config.current_test == 'cufft-external-workspace'):
manual_fix_for_cufft_external_workspace(srcs[0])
if (test_config.current_test in occupancy_calculation_exper):
@@ -275,4 +284,3 @@ def run_test():
if test_config.current_test.startswith('ccl-test'):
return call_subprocess('mpirun -n 2 ' + os.path.join(os.path.curdir, test_config.current_test + '.run '))
return run_binary_with_args()
-