Skip to content

Get Nvidia reps in line with what we run on the clusters #74

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 13 commits on
May 8, 2025
34 changes: 22 additions & 12 deletions build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ SCRIPT=$(readlink -f "${BASH_SOURCE[0]}")
ORIGIN=$(dirname "$SCRIPT")

# Slurm version to download from GitHub
SLURM_VERSION=${VERSION:-24.05.3}
SLURM_VERSION=${VERSION:-24.05.7}
UPSTREAM_REL=${UPSTREAM_REL:-1}

# which release should be used for our RPMs
Expand All @@ -28,17 +28,24 @@ OUR_RELEASE=${RELEASE:-1}
# allow an _empty_ version, which is used in the pipeline

if grep "release 8.8" /etc/redhat-release; then
NVIDIA_DRIVER=${NVIDIA_DRIVER-555.42.06}
NVDRV_NVML_PKG="nvidia-driver-NVML${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
CUDA_VERSION=${CUDA_VERSION:-12.6}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
NVIDIA_MAJOR_VERSION=570
NVIDIA_MINOR_VERSION=133.20
NVIDIA_DRIVER=${NVIDIA_DRIVER-${NVIDIA_MAJOR_VERSION}.${NVIDIA_MINOR_VERSION}}
NVDRV_NVML_PKG="libnvidia-ml${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
CUDA_VERSION=${CUDA_VERSION:-12.8}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
elif grep "release 9.4" /etc/redhat-release; then
NVIDIA_DRIVER=${NVIDIA_DRIVER-555.42.06}
NVDRV_NVML_PKG="nvidia-driver-NVML${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
CUDA_VERSION=${CUDA_VERSION:-12.6}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
NVIDIA_MAJOR_VERSION=570
NVIDIA_MINOR_VERSION=133.20
NVIDIA_DRIVER=${NVIDIA_DRIVER-${NVIDIA_MAJOR_VERSION}.${NVIDIA_MINOR_VERSION}}
NVDRV_NVML_PKG="libnvidia-ml${NVIDIA_DRIVER:+-$NVIDIA_DRIVER}"
CUDA_VERSION=${CUDA_VERSION:-12.8}
CUDA_NVML_PKG="cuda-nvml-devel-${CUDA_VERSION//./-}"
fi




# Prepare directory structure
rm -Rf $ORIGIN/rpmbuild/ $ORIGIN/dist/
mkdir -p $ORIGIN/rpmbuild/{BUILD,RPMS,SRPMS,SOURCES} $ORIGIN/dist
Expand Down Expand Up @@ -97,7 +104,10 @@ sudo dnf -y install munge-devel libjwt-devel pam-devel
sudo dnf -y install http-parser-devel json-c-devel libyaml-devel
# - features: Nvidia NVML
sudo dnf -y autoremove cuda-nvml-* nvidia-driver-NVML-* nvidia-driver* libnvidia-ml*
sudo dnf -y install "$CUDA_NVML_PKG" "$NVDRV_NVML_PKG" "nvidia-driver-devel"

sudo dnf -y module switch-to nvidia-driver:${NVIDIA_MAJOR_VERSION}-dkms

sudo dnf -y install "$CUDA_NVML_PKG" "$NVDRV_NVML_PKG" # "nvidia-driver-devel"
# - plugins: MPI
sudo dnf -y install pmix "pmix-devel ${PMIX_VERSION}" "ucx-devel-${UCX_VERSION}"
# - plugins: cgroup/v2
Expand Down Expand Up @@ -128,7 +138,7 @@ rpmbuild -ba "${RPM_DEFINES[@]}" "${SLURM_BUILDOPTS[@]}" --without nvml \
echo "Doing rpm rebuild (without nvml)"
for rpm in $ORIGIN/rpmbuild/RPMS/x86_64/slurm-*$SUFFIX*.rpm ; do
rpmrebuild --release=${OUR_RELEASE}.${GITTAG}$(rpm -E '%dist').nogpu.ug -d $ORIGIN/dist -p $rpm
done
done 2>&1 | tee rpmrebuild-without-nvml.out


echo "Running rpmbuild (with nvml)"
Expand All @@ -139,7 +149,7 @@ rpmbuild -ba "${RPM_DEFINES[@]}" "${SLURM_BUILDOPTS[@]}" --with nvml \
echo "Doing rpm rebuild (with nvml)"
for rpm in $ORIGIN/rpmbuild/RPMS/x86_64/slurm-*$SUFFIX*.rpm ; do
rpmrebuild --release=${OUR_RELEASE}.${GITTAG}$(rpm -E '%dist').ug -d $ORIGIN/dist -p $rpm
done
done 2>&1 | tee rpmrebuild-with-nvml.out

# strip out torque binaries/wrapper from slurm-torque
rpmrebuild -d $ORIGIN/dist --change-spec-files="sed '/\(pbsnodes\|mpiexec\|bin\/q.\+\)/d'" -p $ORIGIN/dist/x86_64/slurm-torque-*-${OUR_RELEASE}.${GITTAG}$(rpm -E '%dist').nogpu.ug*.rpm
Expand Down
26 changes: 20 additions & 6 deletions src/plugins/accounting_storage/mysql/as_mysql_user.c
Original file line number Diff line number Diff line change
Expand Up @@ -539,6 +539,7 @@ extern int as_mysql_add_users(mysql_conn_t *mysql_conn, uint32_t uid,
if (check_connection(mysql_conn) != SLURM_SUCCESS)
return ESLURM_DB_CONNECTION;

bool is_admin = false;
if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_OPERATOR)) {
slurmdb_user_rec_t user;

Expand All @@ -560,6 +561,8 @@ extern int as_mysql_add_users(mysql_conn_t *mysql_conn, uint32_t uid,
* these accounts if they are coordinators of the
* parent they are trying to add to
*/
} else {
is_admin = true;
}

if (!user_list || !list_count(user_list)) {
Expand All @@ -585,6 +588,11 @@ extern int as_mysql_add_users(mysql_conn_t *mysql_conn, uint32_t uid,
(long)now, (long)now, object->name);

if (object->admin_level != SLURMDB_ADMIN_NOTSET) {
if (!is_admin) {
error("Only admins/operators can add make a user and operator/admin");
rc = ESLURM_ACCESS_DENIED;
break;
}
xstrcat(cols, ", admin_level");
xstrfmtcat(vals, ", %u", object->admin_level);
xstrfmtcat(extra, ", admin_level=%u",
Expand Down Expand Up @@ -666,7 +674,7 @@ extern int as_mysql_add_users(mysql_conn_t *mysql_conn, uint32_t uid,
list_iterator_destroy(itr);
xfree(user_name);

if (rc != SLURM_ERROR) {
if (rc == SLURM_SUCCESS) {
if (txn_query) {
xstrcat(txn_query, ";");
rc = mysql_db_query(mysql_conn,
Expand Down Expand Up @@ -711,7 +719,16 @@ extern char *as_mysql_add_users_cond(mysql_conn_t *mysql_conn, uint32_t uid,
}

if (!is_user_min_admin_level(mysql_conn, uid, SLURMDB_ADMIN_OPERATOR)) {
slurmdb_user_rec_t user;
slurmdb_user_rec_t user_coord = {
.uid = uid,
};

if (user->admin_level != SLURMDB_ADMIN_NOTSET) {
ret_str = xstrdup("Only admins/operators can add make a user and admin/operator");
error("%s", ret_str);
errno = ESLURM_ACCESS_DENIED;
return ret_str;
}

if (slurmdbd_conf->flags & DBD_CONF_FLAG_DISABLE_COORD_DBD) {
ret_str = xstrdup("Coordinator privilege revoked with DisableCoordDBD, only admins/operators can add accounts.");
Expand All @@ -720,10 +737,7 @@ extern char *as_mysql_add_users_cond(mysql_conn_t *mysql_conn, uint32_t uid,
return ret_str;
}

memset(&user, 0, sizeof(slurmdb_user_rec_t));
user.uid = uid;

if (!is_user_any_coord(mysql_conn, &user)) {
if (!is_user_any_coord(mysql_conn, &user_coord)) {
ret_str = xstrdup("Only admins/operators/coordinators can add accounts");
error("%s", ret_str);
errno = ESLURM_ACCESS_DENIED;
Expand Down