3 changes: 1 addition & 2 deletions tests/integration-tests/tests/common/assertions.py
@@ -439,8 +439,7 @@ def _assert_build_image_stack_deleted(stack_name, region, timeout_seconds=600, p
pytest.fail(f"Timed-out waiting for stack {stack_name} deletion (last status: {last_status})")


def assert_regex_in_file(cluster: Cluster, compute_node_ip: str, file_name: str, pattern: str, negate: bool = True):
rce = RemoteCommandExecutor(cluster, compute_node_ip)
def assert_regex_in_file(rce: RemoteCommandExecutor, file_name: str, pattern: str, negate: bool = True):
file_content = read_remote_file(rce, file_name)
assertion = assert_that(bool(re.search(pattern, file_content, re.IGNORECASE)))
assertion.is_false() if negate else assertion.is_true()
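
A minimal usage sketch of the refactored helper, assuming the caller builds one RemoteCommandExecutor per compute node (as assert_no_errors_in_logs does in test_gb200.py below); the log path and pattern mirror the ones used there:

    rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
    # With negate=True the assertion passes only if the pattern is absent from the file.
    assert_regex_in_file(rce, "/var/log/parallelcluster/nvidia-imex-prolog.log", r"(warn|error|fail)", negate=True)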
35 changes: 19 additions & 16 deletions tests/integration-tests/tests/ultraserver/test_gb200.py
@@ -53,30 +53,33 @@ def submit_job_imex_status(rce: RemoteCommandExecutor, queue: str, max_nodes: in
return job_id


def assert_imex_nodes_config_is_correct(
rce: RemoteCommandExecutor, queue_name: str, compute_resource_name: str, expected_ips: list
):
logging.info(f"Checking IMEX nodes config contains the expected nodes: {expected_ips}")
imex_nodes_config_file = (
f"/opt/parallelcluster/shared/nvidia-imex/nodes_config_{queue_name}_{compute_resource_name}.cfg"
)
imex_config_content = read_remote_file(rce, imex_nodes_config_file)
imex_config_content_clean = [line for line in imex_config_content.split("\n") if not line.strip().startswith("#")]
actual_ips = [ip.strip() for ip in imex_config_content_clean]
assert_that(actual_ips).contains_only(*expected_ips)
logging.info(f"IMEX nodes config {imex_nodes_config_file} contains the expected nodes: {expected_ips}")
def assert_imex_nodes_config_is_correct(cluster: Cluster, queue: str, compute_resource: str, expected_ips: list):
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
logging.info(
f"Checking IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
imex_config_content = read_remote_file(rce, "/etc/nvidia-imex/nodes_config.cfg")
imex_config_content_clean = [
line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
]
actual_ips = [ip.strip() for ip in imex_config_content_clean]
assert_that(actual_ips).contains_only(*expected_ips)
logging.info(
f"IMEX nodes config for compute node {compute_node_ip} contains the expected nodes: {expected_ips}"
)
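
For reference, a minimal sketch of the parsing this assertion performs, assuming /etc/nvidia-imex/nodes_config.cfg lists one private IP per line alongside optional '#' comment lines (the newline split above expects exactly this shape; the addresses below are illustrative):

    # Illustrative file content; real files are written by the IMEX prolog on each compute node.
    imex_config_content = "# IMEX nodes config\n10.0.0.10\n10.0.0.11"
    imex_config_content_clean = [
        line for line in imex_config_content.split("\n") if not line.strip().startswith("#")
    ]
    actual_ips = [ip.strip() for ip in imex_config_content_clean]
    assert actual_ips == ["10.0.0.10", "10.0.0.11"]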


def assert_no_errors_in_logs(cluster: Cluster, queue: str, compute_resource: str):
rce = RemoteCommandExecutor(cluster)
logs = ["/var/log/nvidia-imex-verbose.log", "/var/log/parallelcluster/nvidia-imex-prolog.log"]
for compute_node_ip in cluster.get_compute_nodes_private_ip(queue, compute_resource):
rce = RemoteCommandExecutor(cluster, compute_node_ip=compute_node_ip)
for log in logs:
logging.info(f"Checking file {log} log does not contain any error")
if log == "/var/log/nvidia-imex-verbose.log" and not is_existing_remote_file(rce, log):
logging.info("IMEX log file not found. Not an issue as IMEX writes logs there only in case of errors.")
continue
assert_regex_in_file(cluster, compute_node_ip, log, r"(warn|error|fail)", negate=True)
assert_regex_in_file(rce, log, r"(warn|error|fail)", negate=True)


def assert_imex_status(
@@ -210,7 +213,7 @@ def _check_imex_healthy():
f"Private IP addresses for nodes in queue {queue} and compute resource {compute_resource}: " f"{ips}"
)

assert_imex_nodes_config_is_correct(rce, queue, compute_resource, ips)
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, ips)
assert_imex_status(rce, job_id, ips, service_status="UP", node_status="READY", connection_status="CONNECTED")
assert_no_errors_in_logs(cluster, queue, compute_resource)

@@ -240,7 +243,7 @@ def assert_imex_not_configured(cluster: Cluster, queue: str, compute_resource: s

job_id = submit_job_imex_status(rce, queue, max_nodes)

assert_imex_nodes_config_is_correct(rce, queue, compute_resource, FAKE_IPS)
assert_imex_nodes_config_is_correct(cluster, queue, compute_resource, FAKE_IPS)
assert_imex_status(
rce, job_id, FAKE_IPS, service_status="DOWN", node_status="UNAVAILABLE", connection_status="INVALID"
)
@@ -1,7 +1,17 @@
#!/usr/bin/env bash

# This prolog script configures the NVIDIA IMEX nodes config file and reloads the nvidia-imex service.
# This prolog is meant to be run by compute nodes with exclusive jobs.
# This prolog script configures the NVIDIA IMEX on compute nodes involved in the job execution.
#
# In particular:
# - Checks whether the job is executed exclusively.
# If not, it exits immediately because IMEX configuration requires exclusive jobs.
# - Writes the private IP addresses of compute nodes into /etc/nvidia-imex/nodes_config.cfg.
# - Creates the IMEX default channel.
# For more information about IMEX channels, see https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html
# - Restarts the IMEX system service.
#
# REQUIREMENTS:
# - This prolog is expected to run only with exclusive jobs.

LOG_FILE_PATH="/var/log/parallelcluster/nvidia-imex-prolog.log"
SCONTROL_CMD="/opt/slurm/bin/scontrol"
@@ -10,6 +20,7 @@ IMEX_STOP_TIMEOUT=15
#TODO In production, specify p6e-gb200 only. We added g5g only for testing purposes.
ALLOWED_INSTANCE_TYPES="^(p6e-gb200|g5g)"
IMEX_SERVICE="nvidia-imex"
IMEX_NODES_CONFIG="/etc/nvidia-imex/nodes_config.cfg"

function info() {
echo "$(date "+%Y-%m-%dT%H:%M:%S.%3N") [INFO] [PID:$$] [JOB:${SLURM_JOB_ID}] $1"
@@ -48,24 +59,13 @@ function return_if_unsupported_instance_type() {
fi
}

function get_node_names() {
local _queue_name=$1
local _compute_resource_name=$2

${SCONTROL_CMD} show nodes --json | \
jq -r \
--arg queue_name "${_queue_name}" \
--arg compute_resource_name "${_compute_resource_name}" \
'[
.nodes[] |
select(
(.partitions[] | contains($queue_name)) and
(.features[] | contains($compute_resource_name)) and
(.features[] | contains("static"))
) |
.name
] |
join(",")'
function return_if_job_is_not_exclusive() {
if [[ "${SLURM_JOB_OVERSUBSCRIBE}" = "NO" ]]; then
info "Job is exclusive, proceeding with IMEX configuration"
else
info "Skipping IMEX configuration because the job is not exclusive"
prolog_end
fi
}

function get_ips_from_node_names() {
@@ -80,53 +80,6 @@ function get_compute_resource_name() {
echo "${_slurmd_node_name}" | sed -E "s/${_queue_name_prefix}(.+)-[0-9]+$/\1/"
}

function write_file() {
local _file=$1
local _content=$2
local _lock_file="${_file}.lock"
local _lock_timeout_seconds=60

if [[ -f "${_file}" ]] && [[ "$(cat "${_file}")" = "${_content}" ]]; then
info "File ${_file} already has the expected content, skipping the write operation"
return 1 # Not Updated
fi

# Try to acquire lock with timeout
(
if ! flock -x -w ${_lock_timeout_seconds} 200; then
# If timeout, assume deadlock and try to recover
info "Lock timeout after ${_lock_timeout_seconds}s, attempting deadlock recovery"
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

local _lock_result=$?

if [[ ${_lock_result} -eq 0 ]]; then
return 0 # Updated successfully
fi

# Deadlock recovery: remove stale lock file and retry once
error "Potential deadlock detected for ${_file}, attempting recovery"
rm -f "${_lock_file}"
sleep 1 # Brief pause to avoid race conditions

(
if ! flock -x -w 10 200; then
exit 1
fi
echo "${_content}" > "${_file}"
) 200>"${_lock_file}"

if [[ $? -eq 0 ]]; then
info "Lock acquired after deadlock recovery for ${_file}"
return 0 # Updated
fi

error_exit "Failed to acquire lock for ${_file} even after deadlock recovery"
}

function reload_imex() {
info "Stopping IMEX"
timeout ${IMEX_STOP_TIMEOUT} systemctl stop ${IMEX_SERVICE}
@@ -142,9 +95,6 @@ function reload_imex() {
}

function create_default_imex_channel() {
# This configuration follows
# [Nvidia doc](https://docs.nvidia.com/multi-node-nvlink-systems/imex-guide/imexchannels.html)
# This configuration is only suitable for single user environment, and not compatible with multi-user environment.
info "Creating IMEX default Channel"
MAJOR_NUMBER=$(cat /proc/devices | grep nvidia-caps-imex-channels | cut -d' ' -f1)
if [ ! -d "/dev/nvidia-caps-imex-channels" ]; then
@@ -163,27 +113,19 @@ function create_default_imex_channel() {
{
info "PROLOG Start JobId=${SLURM_JOB_ID}: $0"

return_if_job_is_not_exclusive
return_if_unsupported_instance_type

create_default_imex_channel

QUEUE_NAME=$SLURM_JOB_PARTITION
COMPUTE_RESOURCE_NAME=$(get_compute_resource_name "${QUEUE_NAME}-st-" $SLURMD_NODENAME)
CR_NODES=$(get_node_names "${QUEUE_NAME}" "${COMPUTE_RESOURCE_NAME}")
IPS_FROM_CR=$(get_ips_from_node_names "${CR_NODES}")
IMEX_MAIN_CONFIG="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"
IMEX_NODES_CONFIG="/opt/parallelcluster/shared/nvidia-imex/nodes_config_${QUEUE_NAME}_${COMPUTE_RESOURCE_NAME}.cfg"

info "Queue Name: ${QUEUE_NAME}"
info "CR Name: ${COMPUTE_RESOURCE_NAME}"
info "CR Nodes: ${CR_NODES}"
info "Node IPs from CR: ${IPS_FROM_CR}"
info "IMEX Main Config: ${IMEX_MAIN_CONFIG}"
IPS_FROM_CR=$(get_ips_from_node_names "${SLURM_NODELIST}")

info "Node Names: ${SLURM_NODELIST}"
info "Node IPs: ${IPS_FROM_CR}"
info "IMEX Nodes Config: ${IMEX_NODES_CONFIG}"

info "Updating IMEX nodes config ${IMEX_NODES_CONFIG}"
write_file "${IMEX_NODES_CONFIG}" "${IPS_FROM_CR}"

echo "${IPS_FROM_CR}" > "${IMEX_NODES_CONFIG}"
reload_imex

prolog_end
@@ -8,6 +8,5 @@
sleep 45
QUEUE_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_queue_name")
COMPUTE_RES_NAME=$(cat "/etc/chef/dna.json" | jq -r ".cluster.scheduler_compute_resource_name")
IMEX_CONFIG_FILE="/opt/parallelcluster/shared/nvidia-imex/config_${QUEUE_NAME}_${COMPUTE_RES_NAME}.cfg"

srun bash -c "/usr/bin/nvidia-imex-ctl -N -j -c ${IMEX_CONFIG_FILE} > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"
srun bash -c "/usr/bin/nvidia-imex-ctl -N -j > result_\${SLURM_JOB_ID}_\$(hostname).out 2> result_\${SLURM_JOB_ID}_\$(hostname).err"