From d132a211a87ddf8b948bcc7805498f0e41c4ff81 Mon Sep 17 00:00:00 2001
From: Christopher Tate
Date: Tue, 29 Oct 2024 12:13:51 -0600
Subject: [PATCH] Fix to NVIDIA GPU driver devices as volume mounts

A suggested fix from NVIDIA for the drivers that were going missing on
the GPU nodes in the prod, test, and test-2 clusters.

Fixes nerc-project/operations#768
---
NOTE(review): this patch was recovered from a copy in which all line
breaks and indentation had been collapsed. The line structure below
follows the hunk headers; the leading whitespace of the YAML context and
added lines is an assumption (2-space nesting, indented sequences).
Confirm it against the target files before applying.

 nvidia-gpu-operator/base/gpu-cluster-policy.yaml         | 6 ++++++
 .../nerc-ocp-test/clusterpolicy/clusterpolicy_patch.yaml | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/nvidia-gpu-operator/base/gpu-cluster-policy.yaml b/nvidia-gpu-operator/base/gpu-cluster-policy.yaml
index 267cc5e8..d37c6d79 100644
--- a/nvidia-gpu-operator/base/gpu-cluster-policy.yaml
+++ b/nvidia-gpu-operator/base/gpu-cluster-policy.yaml
@@ -23,6 +23,9 @@ spec:
       default: ""
       name: ""
     enabled: true
+    env:
+      - name: DEVICE_LIST_STRATEGY
+        value: volume-mounts
   driver:
     certConfig:
       name: ""
@@ -76,6 +79,9 @@ spec:
   toolkit:
     enabled: true
     installDir: /usr/local/nvidia
+    env:
+      - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS
+        value: "true"
   validator:
     plugin:
       env:
diff --git a/nvidia-gpu-operator/overlays/nerc-ocp-test/clusterpolicy/clusterpolicy_patch.yaml b/nvidia-gpu-operator/overlays/nerc-ocp-test/clusterpolicy/clusterpolicy_patch.yaml
index 54ebe26f..14d1b966 100644
--- a/nvidia-gpu-operator/overlays/nerc-ocp-test/clusterpolicy/clusterpolicy_patch.yaml
+++ b/nvidia-gpu-operator/overlays/nerc-ocp-test/clusterpolicy/clusterpolicy_patch.yaml
@@ -14,3 +14,5 @@ spec:
     env:
       - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_ENVVAR_WHEN_UNPRIVILEGED
         value: 'false'
+      - name: ACCEPT_NVIDIA_VISIBLE_DEVICES_AS_VOLUME_MOUNTS
+        value: "true"