Skip to content

Commit 4a20ede

Browse files
committed
feat(gpu): add WSL CDI spec watcher and set deviceIDStrategy to index
On WSL2 hosts the NVIDIA device plugin generates CDI specs that cannot be used directly by k3s containerd, because they contain a single device named "all" rather than one named after the index or UUID of the device.

Add a background watch_cdi_specs function to cluster-entrypoint.sh that:
- detects WSL2 via /dev/dxg presence
- handles specs already present at gateway restart
- uses inotifywait to watch for new/updated specs
- transforms the spec with jq (cdiVersion=0.5.0, devices[0].name="0")

Add inotify-tools and jq to the cluster image apt-get install block to support the watcher.

Signed-off-by: Evan Lezar <elezar@nvidia.com>
1 parent 6afe945 commit 4a20ede

File tree

3 files changed

+47
-0
lines changed

3 files changed

+47
-0
lines changed

deploy/docker/Dockerfile.images

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -229,6 +229,8 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
229229
iptables \
230230
mount \
231231
dnsutils \
232+
inotify-tools \
233+
jq \
232234
&& rm -rf /var/lib/apt/lists/*
233235

234236
COPY --from=k3s /bin/ /bin/

deploy/docker/cluster-entrypoint.sh

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -317,6 +317,40 @@ fi
317317
# the k3s manifests directory so the Helm controller installs it automatically.
318318
# The nvidia-container-runtime binary is already on PATH (baked into the image)
319319
# so k3s registers the "nvidia" RuntimeClass at startup.
320+
# Paths used by the WSL2 CDI spec watcher defined below:
#   CDI_SPEC_DIR   - directory that watch_cdi_specs creates and monitors
#   CDI_WSL_INPUT  - spec file read by transform_wsl_cdi_spec (named after the
#                    NVIDIA device plugin, which is presumably its writer)
#   CDI_WSL_OUTPUT - transformed spec that transform_wsl_cdi_spec installs
CDI_SPEC_DIR="/var/run/cdi"
321+
CDI_WSL_INPUT="${CDI_SPEC_DIR}/k8s.device-plugin.nvidia.com-gpu.json"
322+
CDI_WSL_OUTPUT="${CDI_SPEC_DIR}/openshell-wsl.json"
323+
324+
# Rewrite the device plugin's CDI spec so its first device is named "0".
#
# Reads $CDI_WSL_INPUT, sets cdiVersion to "0.5.0" and devices[0].name to "0"
# with jq, and installs the result as $CDI_WSL_OUTPUT.
#
# Globals:
#   CDI_WSL_INPUT  (read) - spec file to transform
#   CDI_WSL_OUTPUT (read) - destination path of the transformed spec
# Outputs:
#   A confirmation line on stdout on success; diagnostics (including jq's own
#   error text) on stderr on failure.
# Returns:
#   Always 0. Failures are logged rather than propagated so that a single bad
#   spec cannot abort a caller running under `set -e`.
transform_wsl_cdi_spec() {
  # Stage the result next to the target so the final step is a rename within
  # the same directory instead of an incremental write to the live spec.
  local tmp="${CDI_WSL_OUTPUT}.tmp.$$"
  local jq_err

  # Redirection order matters: stderr is captured into jq_err first, then
  # stdout is pointed at the staging file.
  if ! jq_err=$(jq '.cdiVersion = "0.5.0" | .devices[0].name = "0"' \
      "$CDI_WSL_INPUT" 2>&1 >"$tmp"); then
    rm -f "$tmp"
    echo "CDI: failed to transform WSL spec (jq error)${jq_err:+: $jq_err}" >&2
    return 0
  fi

  # jq exits 0 without printing anything when its input is empty; never
  # replace the target with an empty spec in that case.
  if [ ! -s "$tmp" ]; then
    rm -f "$tmp"
    echo "CDI: failed to transform WSL spec (jq produced no output)" >&2
    return 0
  fi

  if ! mv "$tmp" "$CDI_WSL_OUTPUT"; then
    rm -f "$tmp"
    echo "CDI: failed to install transformed WSL spec at $CDI_WSL_OUTPUT" >&2
    return 0
  fi

  echo "CDI: transformed WSL spec -> $CDI_WSL_OUTPUT"
}
335+
336+
# Keep the transformed WSL CDI spec in sync with the input spec.
#
# Transforms a spec that already exists at startup, then blocks on
# `inotifywait -m`, re-running the transform whenever the input spec is
# written or moved into $CDI_SPEC_DIR. Only specs that mention /dev/dxg are
# transformed. Intended to be run in the background.
#
# Globals:
#   CDI_SPEC_DIR  (read) - directory to create and monitor
#   CDI_WSL_INPUT (read) - spec file whose changes trigger a transform
# Outputs:
#   Diagnostics on stderr when the watcher cannot start or stops.
# Returns:
#   1 if the directory cannot be created, inotifywait is unavailable, or the
#   monitor pipeline ends; does not return while monitoring is healthy.
watch_cdi_specs() {
  # inotifywait's %f format yields bare file names, so compare events against
  # the input spec's base name rather than repeating the literal.
  local spec_name="${CDI_WSL_INPUT##*/}"
  local filename

  if ! mkdir -p "$CDI_SPEC_DIR"; then
    echo "CDI: cannot create $CDI_SPEC_DIR; WSL CDI spec watcher not started" >&2
    return 1
  fi

  # Process spec already present at startup (e.g. gateway restart)
  if [ -f "$CDI_WSL_INPUT" ] && grep -q '/dev/dxg' "$CDI_WSL_INPUT" 2>/dev/null; then
    transform_wsl_cdi_spec
  fi

  if ! command -v inotifywait >/dev/null 2>&1; then
    echo "CDI: inotifywait not found; WSL CDI spec watcher not started" >&2
    return 1
  fi

  # Watch for the spec to appear or be updated. -q silences the "watches
  # established" chatter while still letting real errors reach stderr.
  inotifywait -q -m -e close_write,moved_to --format '%f' "$CDI_SPEC_DIR" \
    | while IFS= read -r filename; do
        if [ "$filename" = "$spec_name" ] \
          && grep -q '/dev/dxg' "$CDI_WSL_INPUT" 2>/dev/null; then
          transform_wsl_cdi_spec
        fi
      done

  # Monitor mode is not expected to end; make an unexpected exit visible.
  echo "CDI: inotifywait exited; WSL CDI spec watcher stopped" >&2
  return 1
}
353+
320354
if [ "${GPU_ENABLED:-}" = "true" ]; then
321355
echo "GPU support enabled — deploying NVIDIA device plugin"
322356

@@ -327,6 +361,11 @@ if [ "${GPU_ENABLED:-}" = "true" ]; then
327361
cp "$manifest" "$K3S_MANIFESTS/"
328362
done
329363
fi
364+
365+
if [ -c /dev/dxg ]; then
366+
echo "WSL2 GPU detected (/dev/dxg present) — starting CDI spec watcher"
367+
watch_cdi_specs &
368+
fi
330369
fi
331370

332371
# ---------------------------------------------------------------------------

deploy/kube/gpu-manifests/nvidia-device-plugin-helmchart.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,11 @@
1212
# (which requires nvidia.com/gpu.present=true) is overridden to empty
1313
# so it schedules on any node without requiring NFD/GFD labels.
1414
#
15+
# The device plugin is set to deviceIDStrategy=index so that device names are
16+
# numeric indices (e.g. "0"). This simplifies the conversion of CDI specs on WSL
17+
# systems, where we need to rename the *.nvidia.com/gpu=all device that is
18+
# generated by the device plugin to *.nvidia.com/gpu=0.
19+
#
1520
# k3s auto-detects nvidia-container-runtime on PATH and registers the "nvidia"
1621
# RuntimeClass automatically, so no manual RuntimeClass manifest is needed.
1722

@@ -28,6 +33,7 @@ spec:
2833
createNamespace: true
2934
valuesContent: |-
3035
runtimeClassName: nvidia
36+
deviceIDStrategy: index
3137
gfd:
3238
enabled: false
3339
nfd:

0 commit comments

Comments
 (0)