Skip to content

Commit 1ac2c30

Browse files
authored
Merge pull request #579 from ocaisa/override_lmod_gpu_check
Allow overriding the Lmod GPU driver check
2 parents c635420 + b15fc3d commit 1ac2c30

File tree

6 files changed

+36
-12
lines changed

6 files changed

+36
-12
lines changed

EESSI-install-software.sh

Lines changed: 10 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,11 @@ display_help() {
1717
echo " --skip-cuda-install - disable installing a full CUDA SDK in the host_injections prefix (e.g. in CI)"
1818
}
1919

20+
# Function to check if a command exists
21+
function command_exists() {
22+
command -v "$1" >/dev/null 2>&1
23+
}
24+
2025
function copy_build_log() {
2126
# copy specified build log to specified directory, with some context added
2227
build_log=${1}
@@ -238,10 +243,11 @@ else
238243
echo "Skipping installation of CUDA SDK in host_injections, since the --skip-cuda-install flag was passed OR no EasyBuild module was found"
239244
fi
240245

241-
# Install drivers in host_injections
242-
# TODO: this is commented out for now, because the script assumes that nvidia-smi is available and works;
243-
# if not, an error is produced, and the bot flags the whole build as failed (even when not installing GPU software)
244-
# ${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
246+
# Install NVIDIA drivers in host_injections (only if the 'nvidia-smi' command is available on the host)
247+
if command_exists "nvidia-smi"; then
248+
echo "Command 'nvidia-smi' found. Installing NVIDIA drivers for use in prefix shell..."
249+
${EESSI_PREFIX}/scripts/gpu_support/nvidia/link_nvidia_host_libraries.sh
250+
fi
245251

246252
# use PR patch file to determine in which easystack files stuff was added
247253
changed_easystacks=$(cat ${pr_diff} | grep '^+++' | cut -f2 -d' ' | sed 's@^[a-z]/@@g' | grep '^easystacks/.*yml$' | egrep -v 'known-issues|missing')

bot/build.sh

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -223,7 +223,15 @@ mkdir -p ${TARBALL_TMP_BUILD_STEP_DIR}
223223
BUILD_STEP_ARGS+=("--save" "${TARBALL_TMP_BUILD_STEP_DIR}")
224224
BUILD_STEP_ARGS+=("--storage" "${STORAGE}")
225225
# add options required to handle NVIDIA support
226-
BUILD_STEP_ARGS+=("--nvidia" "all")
226+
if command_exists "nvidia-smi"; then
227+
echo "Command 'nvidia-smi' found, using available GPU"
228+
BUILD_STEP_ARGS+=("--nvidia" "all")
229+
else
230+
echo "No 'nvidia-smi' found, no available GPU but allowing overriding this check"
231+
BUILD_STEP_ARGS+=("--nvidia" "install")
232+
fi
233+
# Retain location for host injections so we don't reinstall CUDA
234+
# (The driver installation must always be run, because the available driver may change)
227235
if [[ ! -z ${SHARED_FS_PATH} ]]; then
228236
BUILD_STEP_ARGS+=("--host-injections" "${SHARED_FS_PATH}/host-injections")
229237
fi

create_lmodsitepackage.py

Lines changed: 6 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -131,8 +131,9 @@
131131
end
132132
-- when loading CUDA enabled modules check if the necessary driver libraries are accessible to the EESSI linker,
133133
-- otherwise, refuse to load the requested module and print error message
134-
local haveGpu = mt:haveProperty(simpleName,"arch","gpu")
135-
if haveGpu then
134+
local checkGpu = mt:haveProperty(simpleName,"arch","gpu")
135+
local overrideGpuCheck = os.getenv("EESSI_OVERRIDE_GPU_CHECK")
136+
if checkGpu and (overrideGpuCheck == nil) then
136137
local arch = os.getenv("EESSI_CPU_FAMILY") or ""
137138
local cudaVersionFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/cuda_version.txt"
138139
local cudaDriverFile = "/cvmfs/software.eessi.io/host_injections/nvidia/" .. arch .. "/latest/libcuda.so"
@@ -141,7 +142,9 @@
141142
if not (cudaDriverExists or singularityCudaExists) then
142143
local advice = "which relies on the CUDA runtime environment and driver libraries. "
143144
advice = advice .. "In order to be able to use the module, you will need "
144-
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system.\\n"
145+
advice = advice .. "to make sure EESSI can find the GPU driver libraries on your host system. You can "
146+
advice = advice .. "override this check by setting the environment variable EESSI_OVERRIDE_GPU_CHECK but "
147+
advice = advice .. "the loaded application will not be able to execute on your system.\\n"
145148
advice = advice .. refer_to_docs
146149
LmodError("\\nYou requested to load ", simpleName, " ", advice)
147150
else

eessi_container.sh

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -464,10 +464,9 @@ if [[ ${SETUP_NVIDIA} -eq 1 ]]; then
464464
BIND_PATHS="${BIND_PATHS},${EESSI_VAR_LOG}:/var/log,${EESSI_USR_LOCAL_CUDA}:/usr/local/cuda"
465465
[[ ${VERBOSE} -eq 1 ]] && echo "BIND_PATHS=${BIND_PATHS}"
466466
if [[ "${NVIDIA_MODE}" == "install" ]] ; then
467-
# We need to "trick" our LMOD_RC file to allow us to load CUDA modules even without a CUDA driver
468-
# (this works because we build within a container and the LMOD_RC recognises that)
469-
touch ${EESSI_TMPDIR}/libcuda.so
470-
export SINGULARITY_CONTAINLIBS="${EESSI_TMPDIR}/libcuda.so"
467+
# No GPU so we need to "trick" Lmod to allow us to load CUDA modules even without a CUDA driver
468+
# (this variable means EESSI_OVERRIDE_GPU_CHECK=1 will be set inside the container)
469+
export SINGULARITYENV_EESSI_OVERRIDE_GPU_CHECK=1
471470
fi
472471
fi
473472
fi

run_in_compat_layer_env.sh

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,9 @@ fi
2626
if [ ! -z ${EESSI_VERSION_OVERRIDE} ]; then
2727
INPUT="export EESSI_VERSION_OVERRIDE=${EESSI_VERSION_OVERRIDE}; ${INPUT}"
2828
fi
29+
if [ ! -z ${EESSI_OVERRIDE_GPU_CHECK} ]; then
30+
INPUT="export EESSI_OVERRIDE_GPU_CHECK=${EESSI_OVERRIDE_GPU_CHECK}; ${INPUT}"
31+
fi
2932
if [ ! -z ${http_proxy} ]; then
3033
INPUT="export http_proxy=${http_proxy}; ${INPUT}"
3134
fi

scripts/utils.sh

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -78,6 +78,11 @@ function create_directory_structure() {
7878
return $return_code
7979
}
8080

81+
# Function to check if a command exists
82+
function command_exists() {
83+
command -v "$1" >/dev/null 2>&1
84+
}
85+
8186
function get_path_for_tool {
8287
tool_name=$1
8388
tool_envvar_name=$2

0 commit comments

Comments
 (0)