@@ -345,55 +345,93 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then
345345 if [ -c /dev/dxg ]; then
346346 echo " WSL2 detected (/dev/dxg present) — configuring CDI mode for GPU"
347347
348- # 1. Generate CDI spec (nvidia-ctk auto-detects WSL mode)
349- if command -v nvidia-ctk > /dev/null 2>&1 ; then
348+ # 1. Build a complete CDI spec from scratch.
349+ # nvidia-ctk cdi generate has two WSL2 bugs:
350+ # a) only creates name=all but the device plugin assigns by UUID
351+ # b) misses libdxcore.so (the NVML-to-DXG bridge library)
352+ # Writing the spec directly avoids fragile sed patching of YAML.
353+ if command -v nvidia-ctk > /dev/null 2>&1 && command -v nvidia-smi > /dev/null 2>&1 ; then
350354 mkdir -p /var/run/cdi
351- nvidia-ctk cdi generate --output=/var/run/cdi/nvidia.yaml 2>&1 || true
352-
353- # 2. Add per-GPU device entries (UUID and index) to CDI spec.
354- # nvidia-ctk only generates name=all, but the device plugin
355- # assigns GPUs by UUID which must resolve as a CDI device.
356- if [ -f /var/run/cdi/nvidia.yaml ] && command -v nvidia-smi > /dev/null 2>&1 ; then
357- idx=0
358- nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2> /dev/null | while read -r uuid; do
359- uuid=$(echo "$uuid" | tr -d ' ')
360- [ -z "$uuid" ] && continue
361- sed -i "/- name: all/a\\
362- - name: $uuid\\
363- containerEdits:\\
364- deviceNodes:\\
365- - path: /dev/dxg\\
366- - name: \"$idx\"\\
367- containerEdits:\\
368- deviceNodes:\\
369- - path: /dev/dxg" /var/run/cdi/nvidia.yaml
370- idx=$(( idx + 1 ))
371- done
372- # nvidia-ctk cdi generate uses cdiVersion 0.3.0 but the
373- # installed CDI library requires >= 0.5.0
374- sed -i 's/cdiVersion: 0\.3\.0/cdiVersion: 0.5.0/' /var/run/cdi/nvidia.yaml
375- echo " CDI spec: added per-GPU UUID and index device entries"
376- fi
377355
378- # 4. Patch CDI spec: add libdxcore.so mount (nvidia-ctk misses it )
356+ GPU_UUID=$(nvidia-smi --query-gpu=gpu_uuid --format=csv,noheader 2>/dev/null | tr -d ' ' | head -1)
379357 DXCORE_PATH=$(find /usr/lib -name "libdxcore.so" 2>/dev/null | head -1)
380- if [ -n "$DXCORE_PATH" ] && [ -f /var/run/cdi/nvidia.yaml ]; then
381- DXCORE_DIR=$(dirname "$DXCORE_PATH")
382- # Insert libdxcore mount after the mounts: key
383- sed -i " /^ mounts:/a\\
384- - hostPath: $DXCORE_PATH \\
385- containerPath: $DXCORE_PATH \\
386- options:\\
387- - ro\\
388- - nosuid\\
389- - nodev\\
390- - rbind\\
391- - rprivate" /var/run/cdi/nvidia.yaml
392- # Add ldcache folder for libdxcore directory
393- sed -i " s|update-ldcache|update-ldcache\n - --folder\n - $DXCORE_DIR |" /var/run/cdi/nvidia.yaml
394- echo " CDI spec patched with libdxcore.so from $DXCORE_PATH "
358+ DXCORE_DIR=$(dirname "$DXCORE_PATH" 2>/dev/null || echo "/usr/lib/x86_64-linux-gnu")
359+ DRIVER_DIR=$(ls -d /usr/lib/wsl/drivers/nv*.inf_amd64_* 2>/dev/null | head -1)
360+
361+ if [ -n "$DRIVER_DIR" ] && [ -n "$GPU_UUID" ]; then
362+ cat > /var/run/cdi/nvidia.yaml << CDIEOF
363+ ---
364+ cdiVersion: "0.5.0"
365+ kind: nvidia.com/gpu
366+ devices:
367+ - name: all
368+ containerEdits:
369+ deviceNodes:
370+ - path: /dev/dxg
371+ - name: "${GPU_UUID}"
372+ containerEdits:
373+ deviceNodes:
374+ - path: /dev/dxg
375+ - name: "0"
376+ containerEdits:
377+ deviceNodes:
378+ - path: /dev/dxg
379+ containerEdits:
380+ env:
381+ - NVIDIA_VISIBLE_DEVICES=void
382+ hooks:
383+ - hookName: createContainer
384+ path: /usr/bin/nvidia-cdi-hook
385+ args:
386+ - nvidia-cdi-hook
387+ - create-symlinks
388+ - --link
389+ - ${DRIVER_DIR}/nvidia-smi::/usr/bin/nvidia-smi
390+ env:
391+ - NVIDIA_CTK_DEBUG=false
392+ - hookName: createContainer
393+ path: /usr/bin/nvidia-cdi-hook
394+ args:
395+ - nvidia-cdi-hook
396+ - update-ldcache
397+ - --folder
398+ - ${DRIVER_DIR}
399+ - --folder
400+ - ${DXCORE_DIR}
401+ env:
402+ - NVIDIA_CTK_DEBUG=false
403+ mounts:
404+ - hostPath: ${DXCORE_PATH}
405+ containerPath: ${DXCORE_PATH}
406+ options: [ro, nosuid, nodev, rbind, rprivate]
407+ - hostPath: ${DRIVER_DIR}/libcuda.so.1.1
408+ containerPath: ${DRIVER_DIR}/libcuda.so.1.1
409+ options: [ro, nosuid, nodev, rbind, rprivate]
410+ - hostPath: ${DRIVER_DIR}/libcuda_loader.so
411+ containerPath: ${DRIVER_DIR}/libcuda_loader.so
412+ options: [ro, nosuid, nodev, rbind, rprivate]
413+ - hostPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
414+ containerPath: ${DRIVER_DIR}/libnvdxgdmal.so.1
415+ options: [ro, nosuid, nodev, rbind, rprivate]
416+ - hostPath: ${DRIVER_DIR}/libnvidia-ml.so.1
417+ containerPath: ${DRIVER_DIR}/libnvidia-ml.so.1
418+ options: [ro, nosuid, nodev, rbind, rprivate]
419+ - hostPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
420+ containerPath: ${DRIVER_DIR}/libnvidia-ml_loader.so
421+ options: [ro, nosuid, nodev, rbind, rprivate]
422+ - hostPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
423+ containerPath: ${DRIVER_DIR}/libnvidia-ptxjitcompiler.so.1
424+ options: [ro, nosuid, nodev, rbind, rprivate]
425+ - hostPath: ${DRIVER_DIR}/nvcubins.bin
426+ containerPath: ${DRIVER_DIR}/nvcubins.bin
427+ options: [ro, nosuid, nodev, rbind, rprivate]
428+ - hostPath: ${DRIVER_DIR}/nvidia-smi
429+ containerPath: ${DRIVER_DIR}/nvidia-smi
430+ options: [ro, nosuid, nodev, rbind, rprivate]
431+ CDIEOF
432+ echo " CDI spec written (GPU: $GPU_UUID, driver: $DRIVER_DIR, dxcore: $DXCORE_PATH)"
395433 else
396- echo " Warning: libdxcore.so not found — NVML may fail inside pods"
434+ echo " Warning: could not detect GPU UUID or WSL driver store — CDI spec not written"
397435 fi
398436 fi
399437
0 commit comments