Skip to content

Commit b54884e

Browse files
butler54 and claude committed
feat!: add NVIDIA H100 confidential GPU support for bare metal
Enables NVIDIA confidential GPU (H100/H200) on bare metal deployments with full CoCo integration. Addresses three documented gaps in the Red Hat OSC 1.12 documentation:

- Gap 1: Pin GPU Operator to v26.3.0 (v26.3.1 breaks the kata state machine)
- Gap 2: Include kataSandboxDevicePlugin in ClusterPolicy (required for nvidia.com/pgpu resource advertisement)
- Gap 3: Add an imperative job to re-reconcile KataConfig after the GPU Operator labels nodes (kata-cc-nvidia-gpu RuntimeClass creation)

New charts:

- charts/all/nvidia-gpu: ClusterPolicy CR and IOMMU MachineConfig
- charts/coco-supported/gpu-workload: CUDA vectorAdd sample deployment

Also extends Kyverno initdata injection to support the kata-cc-nvidia-gpu runtime class and to propagate initdata to the gpu-workload namespace.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d61db58 commit b54884e

11 files changed

Lines changed: 219 additions & 1 deletion

File tree

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
---
# Imperative job: after the GPU Operator labels GPU nodes, force the
# sandboxed-containers operator to re-reconcile KataConfig so the
# kata-cc-nvidia-gpu RuntimeClass gets created (Gap 3).
- name: Reconcile KataConfig for GPU RuntimeClass
  hosts: localhost
  connection: local
  become: false
  gather_facts: false
  tasks:
    - name: Check for nodes with NVIDIA GPU labels
      kubernetes.core.k8s_info:
        api_version: v1
        kind: Node
        label_selectors:
          - "nvidia.com/gpu.present=true"
      register: gpu_nodes

    - name: Check if kata-cc-nvidia-gpu RuntimeClass exists
      kubernetes.core.k8s_info:
        api_version: node.k8s.io/v1
        kind: RuntimeClass
        name: kata-cc-nvidia-gpu
      register: gpu_runtimeclass

    # Only patch when GPU nodes exist but the RuntimeClass is still missing.
    - name: Trigger KataConfig re-reconciliation
      kubernetes.core.k8s:
        state: patched
        api_version: kataconfiguration.openshift.io/v1
        kind: KataConfig
        name: default-kata-config
        definition:
          metadata:
            annotations:
              # A fresh annotation value forces the operator to reconcile.
              # BUGFIX: gather_facts is false, so ansible_date_time is
              # undefined here; use the Jinja2 now() global instead.
              kata-reconcile: "{{ now(fmt='%s') }}"
      when:
        - gpu_nodes.resources | length > 0
        - gpu_runtimeclass.resources | length == 0

    - name: Report status
      ansible.builtin.debug:
        msg: >-
          GPU nodes: {{ gpu_nodes.resources | length }},
          RuntimeClass exists: {{ gpu_runtimeclass.resources | length > 0 }},
          Action: {{ 'triggered re-reconciliation' if (gpu_nodes.resources | length > 0 and gpu_runtimeclass.resources | length == 0) else 'no action needed' }}

charts/all/coco-kyverno-policies/templates/inject-coco-initdata.yaml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -28,7 +28,7 @@ spec:
2828
all:
2929
- key: "{{ "{{" }}request.object.spec.runtimeClassName || '' {{ "}}" }}"
3030
operator: AnyIn
31-
value: ["kata", "kata-cc", "kata-remote"]
31+
value: ["kata", "kata-cc", "kata-remote", "kata-cc-nvidia-gpu"]
3232
- key: "{{ "{{" }}request.object.metadata.annotations.\"coco.io/initdata-configmap\" || '' {{ "}}" }}"
3333
operator: NotEquals
3434
value: ""
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
---
# Namespaces whose pods receive the CoCo initdata ConfigMap.
workloadNamespaces:
  - hello-openshift
  - kbs-access
  - gpu-workload

# Namespace the initdata ConfigMap is copied from.
initdataSourceNamespace: imperative

charts/all/nvidia-gpu/Chart.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
---
apiVersion: v2
name: nvidia-gpu
description: NVIDIA GPU Operator configuration for confidential containers (ClusterPolicy, IOMMU MachineConfig).
version: 0.0.1
keywords:
  - pattern
  - nvidia
  - gpu
  - confidential
Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
{{- if .Values.enabled }}
# NVIDIA GPU Operator ClusterPolicy configured for confidential
# containers (kata sandbox workloads with full-GPU passthrough).
apiVersion: nvidia.com/v1
kind: ClusterPolicy
metadata:
  name: gpu-cluster-policy
  annotations:
    # Late sync wave: kata/OSC must already be installed.
    argocd.argoproj.io/sync-wave: "110"
spec:
  operator:
    defaultRuntime: crio

  # Run GPU workloads inside kata VMs; passthrough is the default mode.
  sandboxWorkloads:
    enabled: true
    mode: kata
    defaultWorkload: vm-passthrough

  # Kata artifacts are managed by OSC on OpenShift, not by the GPU Operator.
  kataManager:
    enabled: false

  # Confidential-computing manager: switches the GPU CC mode.
  ccManager:
    enabled: {{ .Values.ccManager.enabled }}
    defaultMode: {{ .Values.ccManager.defaultMode | quote }}
    repository: nvcr.io/nvidia/cloud-native
    image: k8s-cc-manager
    version: v0.1.0
    env:
      - name: CC_CAPABLE_DEVICE_IDS
        value: {{ .Values.ccManager.deviceIDs | quote }}

  # Advertises nvidia.com/pgpu for kata sandbox pods (Gap 2).
  # BUGFIX: quote templated string scalars — unquoted template output can
  # render as a non-string or break the YAML parse if the value starts
  # with a special character.
  kataSandboxDevicePlugin:
    enabled: {{ .Values.kataSandboxDevicePlugin.enabled }}
    repository: {{ .Values.kataSandboxDevicePlugin.repository | quote }}
    image: {{ .Values.kataSandboxDevicePlugin.image | quote }}
    version: {{ .Values.kataSandboxDevicePlugin.version | quote }}

  sandboxDevicePlugin:
    enabled: true

  driver:
    enabled: true

  # Container (non-sandbox) device plugin is unused in this deployment.
  devicePlugin:
    enabled: false

  # Binds GPUs to vfio-pci for VM passthrough.
  vfioManager:
    enabled: true

  # GPU feature discovery (node labelling).
  gfd:
    enabled: true

  nfd:
    nodefeaturerules: true
{{- end }}
Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
{{- if .Values.iommu.enabled }}
{{- range list "master" "worker" }}
---
# Enable the IOMMU on {{ . }} nodes (needed for VFIO GPU passthrough).
apiVersion: machineconfiguration.openshift.io/v1
kind: MachineConfig
metadata:
  labels:
    machineconfiguration.openshift.io/role: {{ . }}
  name: 100-iommu-{{ . }}
spec:
  kernelArguments:
    # NOTE(review): intel_iommu is Intel-specific; AMD hosts would need
    # amd_iommu — confirm the target hardware before generalizing.
    - intel_iommu=on
    - iommu=pt
{{- end }}
{{- end }}

charts/all/nvidia-gpu/values.yaml

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
---
# Master switch for rendering the ClusterPolicy.
enabled: true

# Confidential-computing manager settings.
ccManager:
  enabled: true
  defaultMode: "on"  # quoted: a bare `on` would parse as boolean true
  # PCI device IDs of CC-capable GPUs — presumably H100 variants; confirm.
  deviceIDs: "0x2331,0x2322"

kataSandboxDevicePlugin:
  enabled: true
  repository: nvcr.io/nvidia/cloud-native
  image: nvidia-sandbox-device-plugin
  version: "v0.0.2"

# Apply the IOMMU MachineConfig to the master/worker pools.
iommu:
  enabled: true
Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
---
apiVersion: v2
name: gpu-workload
description: Sample CUDA workload for NVIDIA confidential GPU verification.
version: 0.0.1
keywords:
  - pattern
  - nvidia
  - gpu
  - workload
  - confidential
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
# Sample CUDA vectorAdd deployment used to verify confidential GPU
# passthrough via the kata CC runtime class.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: gpu-vectoradd
  labels:
    app: gpu-vectoradd
spec:
  replicas: 1
  selector:
    matchLabels:
      app: gpu-vectoradd
  template:
    metadata:
      labels:
        app: gpu-vectoradd
      annotations:
        # Kyverno injects initdata from this ConfigMap into the pod.
        coco.io/initdata-configmap: initdata
        {{- if .Values.defaultMemory }}
        # Kata hypervisor memory override (value kept as a string).
        io.katacontainers.config.hypervisor.default_memory: {{ .Values.defaultMemory | quote }}
        {{- end }}
    spec:
      # BUGFIX: quote the templated value — an empty .Values.runtimeClassName
      # would otherwise render as YAML null instead of a string.
      runtimeClassName: {{ .Values.runtimeClassName | quote }}
      containers:
        - name: cuda-vectoradd
          image: nvcr.io/nvidia/k8s/cuda-sample:vectoradd-cuda11.7.1-ubuntu20.04
          resources:
            limits:
              # Passthrough-GPU resource advertised by the sandbox device plugin.
              nvidia.com/pgpu: 1
          securityContext:
            privileged: false
            allowPrivilegeEscalation: false
            runAsNonRoot: true
            capabilities:
              drop:
                - ALL
            seccompProfile:
              type: RuntimeDefault
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
---
# RuntimeClass for the sample pod (kata CC with NVIDIA GPU passthrough).
runtimeClassName: "kata-cc-nvidia-gpu"

# Kata guest memory in MiB; quoted so YAML keeps it a string.
defaultMemory: "32768"

global:
  clusterPlatform: ""

0 commit comments

Comments
 (0)