3 changes: 1 addition & 2 deletions tests/integration-tests/tests/common/assertions.py
@@ -439,8 +439,7 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")


def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True):
rce = RemoteCommandExecutor(cluster, compute_node_ip)
def assert_regex_in_file(rce: RemoteCommandExecutor, file_name: str, pattern: str, negate: bool = True):
file_content = read_remote_file(rce, file_name)
assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE)))
assertion.is_false() if negate else assertion.is_true()
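
A minimal usage sketch of the refactored helper, assuming the caller builds one RemoteCommandExecutor per compute node (as assert_no_errors_in_logs does in test_gb200.py below); the log path and pattern mirror the ones used there:

    rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
    # With negate=True the assertion passes only if the pattern is absent from the file.
    assert_regex_in_file(rce, "/var/log/parallelcluster/nvidia-imex-prolog.log", r"(warn|error|fail)", negate=True)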
35 changes: 19 additions & 16 deletions tests/integration-tests/tests/ultraserver/test_gb200.py
@@ -53,30 +53,33 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
return job_id


def assert_imex_nodes_config_is_correct(
rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list
):
logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
imex_nodes_config_file = (
f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg"
)
imex_config_content = read_remote_file(rce, imex_nodes_config_file)
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
actual_ips = [ip.strip() for ip in imex_config_content_clean]
assert_that(actual_ips).contains_only(*expected_ips)
logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}")
def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
logging.info(
f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
imex_config_content_clean = [
line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
]
actual_ips = [ip.strip() for ip in imex_config_content_clean]
assert_that(actual_ips).contains_only(*expected_ips)
logging.info(
f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)
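
For reference, a minimal sketch of the parsing this assertion performs, assuming /etc/nvidia-imex/nodes_config.cfg lists one private IP per line alongside optional '#' comment lines (the newline split above expects exactly this shape; the addresses below are illustrative):

    # Illustrative file content; real files are written by the IMEX prolog on each compute node.
    imex_config_content = "# IMEX nodes config\n10.0.0.10\n10.0.0.11"
    imex_config_content_clean = [
        line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
    ]
    actual_ips = [ip.strip() for ip in imex_config_content_clean]
    assert actual_ips == ["10.0.0.10", "10.0.0.11"]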


def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):
rce = RemoteCommandExecutor(cluster)
logs = ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"]
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
for log in logs:
logging.info(f"Checking file {log} log does not contain any error")
if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log):
logging.info("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors.")
continue
assert_regex_in_file(cluster, compute_node_ip, log, r"(warn|error|fail)", negate=True)
assert_regex_in_file(rce, log, r"(warn|error|fail)", negate=True)


def assert_imex_status(
@@ -210,7 +213,7 @@ def _check_imex_healthy():
f"Private IP addresses for nodes in queue {queue} and compute resource {compute_resource}: " f"{ips}"
)

assert_imex_nodes_config_is_correct(rce, queue, compute_resource, ips)
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, ips)
assert_imex_status(rce, job_id, ips, service_status="UP", node_status="READY", connection_status="CONNECTED")
assert_no_errors_in_logs(cluster, queue, compute_resource)

@@ -240,7 +243,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s

job_id = submit_job_imex_status(rce, queue, max_nodes)

assert_imex_nodes_config_is_correct(rce, queue, compute_resource, FAKE_IPS)
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, FAKE_IPS)
assert_imex_status(
rce, job_id, FAKE_IPS, service_status="DOWN", node_status="UNAVAILABLE", connection_status="INVALID"
)
@@ -1,7 +1,17 @@
#!/usr/bin/env bash

# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
# This prolog is meant to be run by compute nodes with exclusive jobs.
# This prolog script configures the NVIDIA IMEX on compute nodes involved in the job execution.
#
# In particular:
# - Checks whether the job is executed exclusively.
# If not, it exits immediately because IMEX configuration requires exclusive jobs.
# - Writes the private IP addresses of compute nodes into /etc/nvidia-imex/nodes_config.cfg.
# - Creates the IMEX default channel.
# For more information about IMEX channels, see https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html
# - Restarts the IMEX system service.
#
# REQUIREMENTS:
# - This prolog is expected to run only with exclusive jobs.

LOG_FILE_PATH="/var/log/parallelcluster/nvidia-imex-prolog.log"
SCONTROL_CMD="/opt/slurm/bin/scontrol"
@@ -10,6 +20,7 @@ IMEX_STOP_TIMEOUT=15
#TODO In production, specify p6e-gb200 only. We added g5g only for testing purposes.
ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)"
IMEX_SERVICE="nvidia-imex"
IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"

function info() {
echo "$(date "+%Y-%m-%dT%H:%M:%S.%3N") [INFO] [PID:$$] [JOB:${SLURM_JOB_ID}] $1"
@@ -48,24 +59,13 @@ function return_if_unsupported_instance_type() {
fi
}

function get_node_names() {
local _queue_name=$1
local _compute_resource_name=$2

${SCONTROL_CMD} show nodes --json | \
jq -r \
--arg queue_name "${_queue_name}" \
--arg compute_resource_name "${_compute_resource_name}" \
'[
.nodes[] |
select(
(.partitions[] | contains($queue_name)) and
(.features[] | contains($compute_resource_name)) and
(.features[] | contains("static"))
) |
.name
] |
join(",")'
function return_if_job_is_not_exclusive() {
if [[ "${SLURM_JOB_OVERSUBSCRIBE}" = "NO" ]]; then
info "Job is exclusive, proceeding with IMEX configuration"
else
info "Skipping IMEX configuration because the job is not exclusive"
prolog_end
fi
}

function get_ips_from_node_names() {
@@ -80,53 +80,6 @@ function get_compute_resource_name() {
echo "${_slurmd_node_name}" | sed -E "s/${_queue_name_prefix}(.+)-[0-9]+$/\1/"
}

function write_file() {
local _file=$1
local _content=$2
local _lock_file="${_file}.lock"
local _lock_timeout_seconds=60

if [[ -f "${_file}" ]] && [[ "$(cat "${_file}")" = "${_content}" ]]; then
info "File ${_file} already has the expected content, skipping the write operation"
return 1 # Not Updated
fi

# Try to acquire lock with timeout
(
if ! flock -x -w ${_lock_timeout_seconds} 200; then
# If timeout, assume deadlock and try to recover
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

local _lock_result=$?

if [[ ${_lock_result} -eq 0 ]]; then
return 0 # Updated successfully
fi

# Deadlock recovery: remove stale lock file and retry once
error "Potential deadlock detected for ${_file}, attempting recovery"
rm -f "${_lock_file}"
sleep 1 # Brief pause to avoid race conditions

(
if ! flock -x -w 10 200; then
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

if [[ $? -eq 0 ]]; then
info "Lock acquired after deadlock recovery for ${_file}"
return 0 # Updated
fi

error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
}

function reload_imex() {
info "Stopping IMEX"
timeout ${IMEX_STOP_TIMEOUT} systemctl stop ${IMEX_SERVICE}
@@ -142,9 +95,6 @@ function reload_imex() {
}

function create_default_imex_channel() {
# This configuration follows
# [Nvidia doc](https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html)
# This configuration is only suitable for single user environment, and not compatible with multi-user environment.
info "Creating IMEX default Channel"
MAJOR_NUMBER=$(cat /proc/devices | grep nvidia-caps-imex-channels | cut -d' ' -f1)
if [ ! -d "/dev/nvidia-caps-imex-channels" ]; then
@@ -163,27 +113,19 @@ function create_default_imex_channel() {
{
info "PROLOG Start JobId=${SLURM_JOB_ID}: $0"

return_if_job_is_not_exclusive
return_if_unsupported_instance_type

create_default_imex_channel

QUEUE_NAME=$SLURM_JOB_PARTITION
COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"

info "Queue Name: ${QUEUE_NAME}"
info "CR Name: ${COMPUTE_RESOURCE_NAME}"
info "CR Nodes: ${CR_NODES}"
info "Node IPs from CR: ${IPS_FROM_CR}"
info "IMEX Main Config: ${IMEX_MAIN_CONFIG}"
IPS_FROM_CR=$(get_ips_from_node_names "${SLURM_NODELIST}")

info "Node Names: ${SLURM_NODELIST}"
info "Node IPs: ${IPS_FROM_CR}"
info "IMEX Nodes Config: ${IMEX_NODES_CONFIG}"

info "Updating IMEX nodes config ${IMEX_NODES_CONFIG}"
write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"

echo "${IPS_FROM_CR}" > "${IMEX_NODES_CONFIG}"
reload_imex

prolog_end
@@ -8,6 +8,5 @@
sleep 45
QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name")
COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name")
IMEX_CONFIG_FILE="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RES_NAME}.cfg"

srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"
srun bash -c "/usr/bin/nvidia-imex-ctl -N -j > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"