From 6bb715669289cd849a7d7db78cd0b183e7a4aa40 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Tue, 24 Jun 2025 16:36:07 +0200 Subject: [PATCH 1/8] Create for utils function nvidia-smi check Signed-off-by: laraPPr --- EESSI-install-software.sh | 16 +--------------- bot/build.sh | 12 +----------- bot/test.sh | 11 +---------- scripts/utils.sh | 13 +++++++++++++ 4 files changed, 16 insertions(+), 36 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index c491819f..a674fa6f 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -17,11 +17,6 @@ display_help() { echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)" } -# Function to check if a command exists -function command_exists() { - command -v "$1" >/dev/null 2>&1 -} - function copy_build_log() { # copy specified build log to specified directory, with some context added build_log=${1} @@ -307,16 +302,7 @@ fi # Install NVIDIA drivers in host_injections (if they exist) if command_exists "nvidia-smi"; then export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" - nvidia-smi --version - ec=$? - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - fi + check_nvidia-smi_installation fi if [ ! -z "${shared_fs_path}" ]; then diff --git a/bot/build.sh b/bot/build.sh index 2bba0cba..7875e70f 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -264,18 +264,8 @@ BUILD_STEP_ARGS+=("--storage" "${STORAGE}") if command_exists "nvidia-smi"; then # Accept that this may fail set +e - nvidia-smi --version - ec=$? + check_nvidia-smi_installation set -e - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found, using available GPU" - BUILD_STEP_ARGS+=("--nvidia" "all") - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - BUILD_STEP_ARGS+=("--nvidia" "install") - fi else echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" BUILD_STEP_ARGS+=("--nvidia" "install") diff --git a/bot/test.sh b/bot/test.sh index 168b0d5c..0eaf07dd 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -222,17 +222,8 @@ TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") if command_exists "nvidia-smi"; then # Accept that this may fail set +e - nvidia-smi --version - ec=$? + check_nvidia-smi_installation set -e - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found, using available GPU" - TEST_STEP_ARGS+=("--nvidia" "run") - else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." - fi fi # prepare arguments to test_suite.sh (specific to test step) diff --git a/scripts/utils.sh b/scripts/utils.sh index 962decd2..cb0a5fe7 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -147,3 +147,16 @@ function get_ipv4_address { echo "${hipv4}" return 0 } + +function check_nvidia-smi_installation { + nvidia-smi --version + ec=$? + if [ ${ec} -eq 0 ]; then + echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + fi +} From fb8fdfd673cc6e449d23cc50fef0efc807b8532b Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 31 Jul 2025 16:11:16 +0200 Subject: [PATCH 2/8] Update tests_scripts.yml --- .github/workflows/tests_scripts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 7a8f5fa4..8b55b9a3 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -51,7 +51,7 @@ jobs: # can't test with EasyBuild versions older than v4.5.2 when using EESSI 2023.06, # since Python in compat layer is Python 3.11.x; # testing with a single EasyBuild version takes a while in GitHub Actions, so stick to a single sensible version - for EB_VERSION in '4.6.0'; do + for EB_VERSION in '5.1.0'; do # Create script that uses load_easybuild_module.sh which we can run in compat layer environment # note: Be careful with single vs double quotes below! # ${EB_VERSION} should be expanded, so use double quotes; From 986f58d27a9d25e04f7a0e4480c2a782afce164d Mon Sep 17 00:00:00 2001 From: ocaisa Date: Thu, 31 Jul 2025 16:24:13 +0200 Subject: [PATCH 3/8] Update tests_scripts.yml --- .github/workflows/tests_scripts.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/tests_scripts.yml b/.github/workflows/tests_scripts.yml index 8b55b9a3..db942c57 100644 --- a/.github/workflows/tests_scripts.yml +++ b/.github/workflows/tests_scripts.yml @@ -113,7 +113,7 @@ jobs: # scripts need to be copied to /tmp, # since create_directory_tarballs.sh must be accessible from within build container - ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh 2023.06 + ./eessi_container.sh --mode run --verbose /software-layer-scripts/create_directory_tarballs.sh "${{matrix.EESSI_VERSION}}" # check if tarballs have been produced ls -l *.tar.gz From 49de3c3a9f361b7dc3b002099c9a8e04362a31bb Mon Sep 17 00:00:00 2001 From: laraPPr Date: Fri, 1 Aug 2025 13:47:13 +0200 Subject: [PATCH 4/8] fix the nvidia-smi utils funtion Signed-off-by: laraPPr --- EESSI-install-software.sh | 11 +++++++++-- bot/build.sh | 17 ++++++++++------- bot/test.sh | 12 +++++++----- scripts/utils.sh | 24 +++++++++++++++--------- 4 files changed, 41 insertions(+), 23 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 4b0868ed..c76745cc 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -310,10 +310,17 @@ else fi # Install NVIDIA drivers in host_injections (if they exist) -if command_exists "nvidia-smi"; then +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? +if [ ${ec} -eq 0 ]; then + export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" + ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +elif [ ${ec} -eq 1 ]; then export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" - check_nvidia-smi_installation fi +set -e if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources diff --git a/bot/build.sh b/bot/build.sh index b40cdfac..6a08c7f2 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -220,15 +220,18 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -if command_exists "nvidia-smi"; then - # Accept that this may fail - set +e - check_nvidia-smi_installation - set -e -else - echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? +if [ ${ec} -eq 0 ]; then + BUILD_STEP_ARGS+=("--nvidia" "all") +elif [ ${ec} -eq 1 ]; then + BUILD_STEP_ARGS+=("--nvidia" "install") +elif [ ${ec} -eq 2 ]; then BUILD_STEP_ARGS+=("--nvidia" "install") fi +set -e # Retain location for host injections so we don't reinstall CUDA # (Always need to run the driver installation as available driver may change) diff --git a/bot/test.sh b/bot/test.sh index cfeccc99..0cb10174 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -225,12 +225,14 @@ fi TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support -if command_exists "nvidia-smi"; then - # Accept that this may fail - set +e - check_nvidia-smi_installation - set -e +# Accept that this may fail +set +e +verify_nvidia-smi +ec=$? +if [ ${ec} -eq 0 ]; then + TEST_STEP_ARGS+=("--nvidia" "run") fi +set -e # prepare arguments to test_suite.sh (specific to test step) declare -a TEST_SUITE_ARGS=() diff --git a/scripts/utils.sh b/scripts/utils.sh index cb0a5fe7..2adc1a0d 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -148,15 +148,21 @@ function get_ipv4_address { return 0 } -function check_nvidia-smi_installation { - nvidia-smi --version - ec=$? - if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." - ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh +function verify_nvidia-smi { + if command_exists "nvidia-smi"; then + nvidia-smi --version + ec=$? + if [ ${ec} -eq 0 ]; then + echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." + return 0 + else + echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." + echo "This script now assumes this is NOT a GPU node." + echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + return 1 + fi else - echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." - echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." + echo echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" + return 2 fi } From a84d7c6e32e0ae2719ea2ac354347c96825ea76c Mon Sep 17 00:00:00 2001 From: Lara Ramona Peeters <49882639+laraPPr@users.noreply.github.com> Date: Fri, 1 Aug 2025 14:19:36 +0200 Subject: [PATCH 5/8] simplify code in test.sh Co-authored-by: ocaisa --- bot/test.sh | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/bot/test.sh b/bot/test.sh index 0cb10174..cad883af 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -225,14 +225,9 @@ fi TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support -# Accept that this may fail -set +e -verify_nvidia-smi -ec=$? -if [ ${ec} -eq 0 ]; then +if verify_nvidia-smi; then TEST_STEP_ARGS+=("--nvidia" "run") fi -set -e # prepare arguments to test_suite.sh (specific to test step) declare -a TEST_SUITE_ARGS=() From 2e8e7a4bb270a0122b911b1aa9a29adb7101cda3 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Fri, 1 Aug 2025 14:23:26 +0200 Subject: [PATCH 6/8] fix comment for EESSI-install-software.sh Signed-off-by: laraPPr --- EESSI-install-software.sh | 1 + scripts/utils.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index c76745cc..8cbaaebf 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -315,6 +315,7 @@ set +e verify_nvidia-smi ec=$? if [ ${ec} -eq 0 ]; then + echo "Installing NVIDIA drivers for use in prefix shell..."" export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh elif [ ${ec} -eq 1 ]; then diff --git a/scripts/utils.sh b/scripts/utils.sh index 2adc1a0d..74bea158 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -153,7 +153,7 @@ function verify_nvidia-smi { nvidia-smi --version ec=$? if [ ${ec} -eq 0 ]; then - echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..." + echo "Command 'nvidia-smi' found." return 0 else echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." From a741480085e416166355f234f10ada1b41f92633 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Fri, 1 Aug 2025 15:08:56 +0200 Subject: [PATCH 7/8] Take all sugestions into account Signed-off-by: laraPPr --- scripts/utils.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/utils.sh b/scripts/utils.sh index 74bea158..7782e5af 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -162,7 +162,7 @@ function verify_nvidia-smi { return 1 fi else - echo echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" + echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" return 2 fi } From f400e47d27108b5b8d99cf0f5e3697b6bc176601 Mon Sep 17 00:00:00 2001 From: laraPPr Date: Fri, 1 Aug 2025 15:12:38 +0200 Subject: [PATCH 8/8] Take all sugestions into account Signed-off-by: laraPPr --- EESSI-install-software.sh | 12 ++---------- bot/build.sh | 11 ++--------- bot/test.sh | 2 +- scripts/utils.sh | 8 ++++---- 4 files changed, 9 insertions(+), 24 deletions(-) diff --git a/EESSI-install-software.sh b/EESSI-install-software.sh index 8cbaaebf..bf5c59ca 100755 --- a/EESSI-install-software.sh +++ b/EESSI-install-software.sh @@ -310,18 +310,10 @@ else fi # Install NVIDIA drivers in host_injections (if they exist) -# Accept that this may fail -set +e -verify_nvidia-smi -ec=$? -if [ ${ec} -eq 0 ]; then - echo "Installing NVIDIA drivers for use in prefix shell..."" - export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" +if nvidia_gpu_available; then + echo "Installing NVIDIA drivers for use in prefix shell..." ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh -elif [ ${ec} -eq 1 ]; then - export LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" fi -set -e if [ ! -z "${shared_fs_path}" ]; then shared_eb_sourcepath=${shared_fs_path}/easybuild/sources diff --git a/bot/build.sh b/bot/build.sh index 6a08c7f2..290444f1 100755 --- a/bot/build.sh +++ b/bot/build.sh @@ -220,18 +220,11 @@ BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}") BUILD_STEP_ARGS+=("--storage" "${STORAGE}") # add options required to handle NVIDIA support -# Accept that this may fail -set +e -verify_nvidia-smi -ec=$? -if [ ${ec} -eq 0 ]; then +if nvidia_gpu_available; then BUILD_STEP_ARGS+=("--nvidia" "all") -elif [ ${ec} -eq 1 ]; then - BUILD_STEP_ARGS+=("--nvidia" "install") -elif [ ${ec} -eq 2 ]; then +else BUILD_STEP_ARGS+=("--nvidia" "install") fi -set -e # Retain location for host injections so we don't reinstall CUDA # (Always need to run the driver installation as available driver may change) diff --git a/bot/test.sh b/bot/test.sh index cad883af..93907de5 100755 --- a/bot/test.sh +++ b/bot/test.sh @@ -225,7 +225,7 @@ fi TEST_STEP_ARGS+=("--extra-bind-paths" "/sys/fs/cgroup:/hostsys/fs/cgroup:ro") # add options required to handle NVIDIA support -if verify_nvidia-smi; then +if nvidia_gpu_available; then TEST_STEP_ARGS+=("--nvidia" "run") fi diff --git a/scripts/utils.sh b/scripts/utils.sh index 7782e5af..51fb2155 100644 --- a/scripts/utils.sh +++ b/scripts/utils.sh @@ -148,9 +148,10 @@ function get_ipv4_address { return 0 } -function verify_nvidia-smi { +function nvidia_gpu_available { if command_exists "nvidia-smi"; then - nvidia-smi --version + # We are careful here in case we are running in a container and LD_LIBARY_PATH has been wiped + LD_LIBRARY_PATH="/.singularity.d/libs:${LD_LIBRARY_PATH}" nvidia-smi --version ec=$? if [ ${ec} -eq 0 ]; then echo "Command 'nvidia-smi' found." @@ -158,11 +159,10 @@ function verify_nvidia-smi { else echo "Warning: command 'nvidia-smi' found, but 'nvidia-smi --version' did not run succesfully." echo "This script now assumes this is NOT a GPU node." - echo "If, and only if, the current node actually does contain Nvidia GPUs, this should be considered an error." return 1 fi else - echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check" + echo "No 'nvidia-smi' found, no available GPU." return 2 fi }