Skip to content

Commit e2b9f10

Browse files
committed
fix(gpu): write complete CDI spec instead of fragile sed patching
The previous approach used sed to inject GPU UUID entries and libdxcore.so mounts into the nvidia-ctk-generated CDI spec. This corrupted the YAML structure (duplicate containerEdits keys), causing CDI device resolution to fail with "failed to unmarshal CDI Spec".

Replace this with writing the complete CDI spec from scratch using a heredoc, which is more robust and easier to understand. The spec includes:

- the /dev/dxg device node
- per-GPU entries by UUID and by index (for device-plugin allocation)
- the libdxcore.so mount (missing from nvidia-ctk output on WSL2)
- all WSL driver-store library mounts
- ldcache update hooks for both the driver-store and libdxcore directories

Tested end-to-end: nemoclaw onboard -> gateway start -> WSL2 fix -> sandbox create with GPU -> nvidia-smi working inside the sandbox pod.
1 parent dae043b commit e2b9f10

File tree

1 file changed

+83
-45
lines changed

1 file changed

+83
-45
lines changed

deploy/docker/cluster-entrypoint.sh

Lines changed: 83 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -345,55 +345,93 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then
345345
if [ -c /dev/dxg ]; then
346346
echo "WSL2 detected (/dev/dxg present) — configuring CDI mode for GPU"
347347

348-
# 1. Generate CDI spec (nvidia-ctk auto-detects WSL mode)
349-
if command -v nvidia-ctk >/dev/null 2>&1; then
348+
# 1. Build a complete CDI spec from scratch.
349+
# nvidia-ctk cdi generate has two WSL2 bugs:
350+
# a) only creates name=all but the device plugin assigns by UUID
351+
# b) misses libdxcore.so (the NVML-to-DXG bridge library)
352+
# Writing the spec directly avoids fragile sed patching of YAML.
353+
if command -v nvidia-ctk >/dev/null 2>&1 && command -v nvidia-smi >/dev/null 2>&1; then
350354
mkdir -p /var/run/cdi
351-
nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml 2>&1 || true
352-
353-
# 2. Add per-GPU device entries (UUID and index) to CDI spec.
354-
# nvidia-ctk only generates name=all, but the device plugin
355-
# assigns GPUs by UUID which must resolve as a CDI device.
356-
if [ -f /var/run/cdi/nvidia.yaml ] && command -v nvidia-smi >/dev/null 2>&1; then
357-
idx=0
358-
nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | while read -r uuid; do
359-
uuid=$(echo "$uuid" | tr -d ' ')
360-
[ -z "$uuid" ] && continue
361-
sed -i "/- name: all/a\\
362-
- name: $uuid\\
363-
containerEdits:\\
364-
deviceNodes:\\
365-
- path: /dev/dxg\\
366-
- name: \"$idx\"\\
367-
containerEdits:\\
368-
deviceNodes:\\
369-
- path: /dev/dxg" /var/run/cdi/nvidia.yaml
370-
idx=$((idx + 1))
371-
done
372-
# nvidia-ctk cdi generate uses cdiVersion 0.3.0 but the
373-
# installed CDI library requires >= 0.5.0
374-
sed -i 's/cdiVersion: 0\.3\.0/cdiVersion: 0.5.0/' /var/run/cdi/nvidia.yaml
375-
echo "CDI spec: added per-GPU UUID and index device entries"
376-
fi
377355

378-
# 4. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it)
356+
GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d ' ' | head -1)
379357
DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
380-
if [ -n "$DXCORE_PATH" ] && [ -f /var/run/cdi/nvidia.yaml ]; then
381-
DXCORE_DIR=$(dirname "$DXCORE_PATH")
382-
# Insert libdxcore mount after the mounts: key
383-
sed -i "/^ mounts:/a\\
384-
- hostPath: $DXCORE_PATH\\
385-
containerPath: $DXCORE_PATH\\
386-
options:\\
387-
- ro\\
388-
- nosuid\\
389-
- nodev\\
390-
- rbind\\
391-
- rprivate" /var/run/cdi/nvidia.yaml
392-
# Add ldcache folder for libdxcore directory
393-
sed -i "s|update-ldcache|update-ldcache\n - --folder\n - $DXCORE_DIR|" /var/run/cdi/nvidia.yaml
394-
echo "CDI spec patched with libdxcore.so from $DXCORE_PATH"
358+
DXCORE_DIR=$(dirname "$DXCORE_PATH" 2>/dev/null || echo "/usr/lib/x86_64-linux-gnu")
359+
DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)
360+
361+
if [ -n "$DRIVER_DIR" ] && [ -n "$GPU_UUID" ]; then
362+
cat > /var/run/cdi/nvidia.yaml <<CDIEOF
363+
---
364+
cdiVersion: "0.5.0"
365+
kind: nvidia.com/gpu
366+
devices:
367+
- name: all
368+
containerEdits:
369+
deviceNodes:
370+
- path: /dev/dxg
371+
- name: "${GPU_UUID}"
372+
containerEdits:
373+
deviceNodes:
374+
- path: /dev/dxg
375+
- name: "0"
376+
containerEdits:
377+
deviceNodes:
378+
- path: /dev/dxg
379+
containerEdits:
380+
env:
381+
- NVIDIA_VISIBLE_DEVICES=void
382+
hooks:
383+
- hookName: createContainer
384+
path: /usr/bin/nvidia-cdi-hook
385+
args:
386+
- nvidia-cdi-hook
387+
- create-symlinks
388+
- --link
389+
- ${DRIVER_DIR}/nvidia-smi::/usr/bin/nvidia-smi
390+
env:
391+
- NVIDIA_CTK_DEBUG=false
392+
- hookName: createContainer
393+
path: /usr/bin/nvidia-cdi-hook
394+
args:
395+
- nvidia-cdi-hook
396+
- update-ldcache
397+
- --folder
398+
- ${DRIVER_DIR}
399+
- --folder
400+
- ${DXCORE_DIR}
401+
env:
402+
- NVIDIA_CTK_DEBUG=false
403+
mounts:
404+
- hostPath: ${DXCORE_PATH}
405+
containerPath: ${DXCORE_PATH}
406+
options: [ro, nosuid, nodev, rbind, rprivate]
407+
- hostPath: ${DRIVER_DIR}/libcuda.so.1.1
408+
containerPath: ${DRIVER_DIR}/libcuda.so.1.1
409+
options: [ro, nosuid, nodev, rbind, rprivate]
410+
- hostPath: ${DRIVER_DIR}/libcuda_loader.so
411+
containerPath: ${DRIVER_DIR}/libcuda_loader.so
412+
options: [ro, nosuid, nodev, rbind, rprivate]
413+
- hostPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
414+
containerPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
415+
options: [ro, nosuid, nodev, rbind, rprivate]
416+
- hostPath: ${DRIVER_DIR}/libnvidia-ml.so.1
417+
containerPath: ${DRIVER_DIR}/libnvidia-ml.so.1
418+
options: [ro, nosuid, nodev, rbind, rprivate]
419+
- hostPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
420+
containerPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
421+
options: [ro, nosuid, nodev, rbind, rprivate]
422+
- hostPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
423+
containerPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
424+
options: [ro, nosuid, nodev, rbind, rprivate]
425+
- hostPath: ${DRIVER_DIR}/nvcubins.bin
426+
containerPath: ${DRIVER_DIR}/nvcubins.bin
427+
options: [ro, nosuid, nodev, rbind, rprivate]
428+
- hostPath: ${DRIVER_DIR}/nvidia-smi
429+
containerPath: ${DRIVER_DIR}/nvidia-smi
430+
options: [ro, nosuid, nodev, rbind, rprivate]
431+
CDIEOF
432+
echo "CDI spec written (GPU: $GPU_UUID, driver: $DRIVER_DIR, dxcore: $DXCORE_PATH)"
395433
else
396-
echo "Warning: libdxcore.so not found — NVML may fail inside pods"
434+
echo "Warning: could not detect GPU UUID or WSL driver store — CDI spec not written"
397435
fi
398436
fi
399437

0 commit comments

Comments
 (0)