Skip to content

Commit 6f72a74

Browse files
authored
RUN-17256 Adjust to run.ai/reserve_for_gpu_index change (#69)
1 parent f268b08 commit 6f72a74

9 files changed

+720
-244
lines changed
+228
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,228 @@
---
# Pod "frac-1-0-0" (namespace runai-pa): a workload pod annotated with
# gpu-fraction "0.5" and owned by the RunaiJob "frac-1".
# Nesting is reconstructed from a flattened diff view; sequences use the
# flush ("- " at the parent's indent) style throughout.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    clusterId: d69dff42-4134-41d9-90fc-1c39505cb774
    cni.projectcalico.org/containerID: 4357ed00f685ddcfafb9b551fbad81f6ad9a39d74f0158fe22b6e937dad415df
    cni.projectcalico.org/podIP: 100.122.249.152/32
    cni.projectcalico.org/podIPs: 100.122.249.152/32
    gpu-fraction: "0.5"
    pod-group-name: pg-frac-1-0-2237ca39-cac0-4601-b658-8a3c5f406a4f
    received-resource-type: Fraction
    runai-allocated-gpu-memory: "7680"
    runai-allocated-gpus: "0.5"
    runai-allocated-mig-gpus: "0"
    runai-calculated-status: Running
    runai-job-id: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    runai-node: i-0b498db53280b86a6
    runai/shared-gpu-configmap: frac-1-ns26p7c-runai-sh-gpu
    # NOTE(review): manifest line 19 (between runai/shared-gpu-configmap and
    # workloadId) has no content in this capture; presumably one annotation
    # was dropped. Restore it from the original commit if it matters.
    workloadId: 027397ab-4c3c-45f7-87d0-8b3bae4ded65
  creationTimestamp: "2024-03-31T09:03:22Z"
  generateName: frac-1-
  labels:
    app: runaijob
    controller-uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    createdBy: RunaiJob
    project: pa
    release: frac-1
    run.ai/top-owner-uid: 027397ab-4c3c-45f7-87d0-8b3bae4ded65
    # Same runai-gpu-group value as the runai-reservation pod later in this file.
    runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b
    runai/pod-index: 0-0
    workloadKind: TrainingWorkload
    workloadName: frac-1
  name: frac-1-0-0
  namespace: runai-pa
  ownerReferences:
  - apiVersion: run.ai/v1
    blockOwnerDeletion: true
    controller: true
    kind: RunaiJob
    name: frac-1
    uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f
  resourceVersion: "10748"
  uid: a801b3c7-b9be-4830-821c-2456cad2234f
# Spec of pod "frac-1-0-0": a single container "frac-1" scheduled by
# runai-scheduler onto node i-0b498db53280b86a6. GPU visibility and the
# ld.so.preload / runai.d files are sourced from the ConfigMap
# frac-1-ns26p7c-runai-sh-gpu-0 (env valueFrom + subPath mounts below).
spec:
  affinity:
    nodeAffinity:
      requiredDuringSchedulingIgnoredDuringExecution:
        nodeSelectorTerms:
        - matchExpressions:
          - key: runai/node-pool
            operator: DoesNotExist
  containers:
  - env:
    - name: RUNAI_JOB_NAME
      value: frac-1
    - name: RUNAI_PROJECT
      value: pa
    - name: WANDB_NOTES
      value: https://shaibi-real.runailabs.com/trainings?columnFilter=[{"term":"frac-1","name":"name"}]&clusterId=d69dff42-4134-41d9-90fc-1c39505cb774
    # Numeric-looking env values stay quoted so they remain strings.
    - name: POD_INDEX
      value: "0"
    - name: RUNAI_GPU_MEMORY_REQUEST
      value: "0.50"
    - name: RUNAI_GPU_MEMORY_LIMIT
      value: "0.50"
    - name: NVIDIA_VISIBLE_DEVICES
      valueFrom:
        configMapKeyRef:
          key: RUNAI-VISIBLE-DEVICES
          name: frac-1-ns26p7c-runai-sh-gpu-0
    - name: RUNAI_NUM_OF_GPUS
      valueFrom:
        configMapKeyRef:
          key: RUNAI_NUM_OF_GPUS
          name: frac-1-ns26p7c-runai-sh-gpu-0
    # The following variables appear in both camelCase and UPPER_SNAKE_CASE
    # spellings with identical values.
    - name: jobUUID
      value: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    - name: JOB_UUID
      value: 2237ca39-cac0-4601-b658-8a3c5f406a4f
    - name: jobName
      value: frac-1
    - name: JOB_NAME
      value: frac-1
    - name: reporterGatewayURL
      value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091
    - name: REPORTER_GATEWAY_URL
      value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091
    - name: podUUID
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.uid
    - name: POD_UUID
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.uid
    - name: NODE_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: spec.nodeName
    envFrom:
    - configMapRef:
        name: frac-1-ns26p7c-runai-sh-gpu-0-evar
        optional: false
    image: gshaibi/gpu-burn
    imagePullPolicy: IfNotPresent
    name: frac-1
    # Only CPU/memory requests are declared; no nvidia.com/gpu resource here.
    resources:
      requests:
        cpu: 100m
        memory: 100M
    securityContext:
      allowPrivilegeEscalation: false
      capabilities: {}
      seccompProfile:
        type: RuntimeDefault
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-dfphn
      readOnly: true
    # Individual keys of the shared-GPU ConfigMap volume mounted as files.
    - mountPath: /etc/ld.so.preload
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: ld.so.preload-key
    - mountPath: /etc/runai.d/memory
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: config
    - mountPath: /etc/runai.d/pod_uuid
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: pod-uuid
    - mountPath: /runai/shared
      name: runai-shared-directory
      readOnly: true
    - mountPath: /etc/runai.d/route
      name: frac-1-ns26p7c-runai-sh-gpu-0-vol
      readOnly: true
      subPath: route
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  nodeName: i-0b498db53280b86a6
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Never
  schedulerName: runai-scheduler
  securityContext: {}
  serviceAccount: default
  serviceAccountName: default
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: kube-api-access-dfphn
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
  - configMap:
      defaultMode: 420
      name: frac-1-ns26p7c-runai-sh-gpu-0
    name: frac-1-ns26p7c-runai-sh-gpu-0-vol
  - hostPath:
      path: /var/lib/runai/shared
      type: DirectoryOrCreate
    name: runai-shared-directory
# Observed status of pod "frac-1-0-0": phase Running, one container
# ("frac-1") started at 2024-03-31T09:03:51Z with zero restarts.
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:51Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:51Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://4205608c75216bfe3d3a71ea7301f8bc041acba92673e033fc87be6d91867dc6
    image: docker.io/gshaibi/gpu-burn:latest
    imageID: docker.io/gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979
    lastState: {}
    name: frac-1
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2024-03-31T09:03:51Z"
  hostIP: 172.20.62.77
  phase: Running
  podIP: 100.122.249.152
  podIPs:
  - ip: 100.122.249.152
  qosClass: Burstable
  startTime: "2024-03-31T09:03:27Z"
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,124 @@
---
# Pod "runai-reservation-gpu-i-0b498db53280b86a6-fzdhl" (namespace
# runai-reservation): labelled app.runai.resource.reservation=runai-reservation-gpu.
# Nesting is reconstructed from a flattened diff view; sequences use the
# flush ("- " at the parent's indent) style throughout.
apiVersion: v1
kind: Pod
metadata:
  annotations:
    cni.projectcalico.org/containerID: 75affaf027829643896b3de5699d15fedb291f4f7efac6f00b0d0bbe9a2dd65a
    cni.projectcalico.org/podIP: 100.122.249.151/32
    cni.projectcalico.org/podIPs: 100.122.249.151/32
    pod-group-name: pg-runai-reservation-gpu-i-0b498db53280b86a6-fzdhl-3b47e794-97f0-4824-b7d5-bb44c122039e
    # The value is a "GPU-<uuid>"-style identifier, not a numeric index.
    # NOTE(review): presumably this is the annotation change named in the
    # commit title (RUN-17256) - confirm against the consuming code.
    run.ai/reserve_for_gpu_index: GPU-8983c66a-23df-e63b-4c2f-afcae9ec79b3
    runai-job-id: 3b47e794-97f0-4824-b7d5-bb44c122039e
  creationTimestamp: "2024-03-31T09:03:25Z"
  labels:
    app: runai-reservation
    app.runai.resource.reservation: runai-reservation-gpu
    # Same runai-gpu-group value as the frac-1-0-0 pod earlier in this file.
    runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b
  name: runai-reservation-gpu-i-0b498db53280b86a6-fzdhl
  namespace: runai-reservation
  resourceVersion: "10625"
  uid: 3b47e794-97f0-4824-b7d5-bb44c122039e
# Spec of the reservation pod: one container "runai-reservation" that
# requests and limits exactly one nvidia.com/gpu, scheduled by
# runai-scheduler onto node i-0b498db53280b86a6 with restartPolicy Always.
spec:
  containers:
  - env:
    - name: POD_NAME
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.name
    - name: POD_NAMESPACE
      valueFrom:
        fieldRef:
          apiVersion: v1
          fieldPath: metadata.namespace
    image: gcr.io/run-ai-prod/resource-reservation:v3.5.0
    imagePullPolicy: IfNotPresent
    name: runai-reservation
    resources:
      limits:
        nvidia.com/gpu: "1"
      requests:
        nvidia.com/gpu: "1"
    terminationMessagePath: /dev/termination-log
    terminationMessagePolicy: File
    volumeMounts:
    - mountPath: /var/run/secrets/kubernetes.io/serviceaccount
      name: kube-api-access-fnjgk
      readOnly: true
  dnsPolicy: ClusterFirst
  enableServiceLinks: true
  imagePullSecrets:
  - name: runai-reg-creds
  nodeName: i-0b498db53280b86a6
  preemptionPolicy: PreemptLowerPriority
  priority: 0
  restartPolicy: Always
  schedulerName: runai-scheduler
  securityContext: {}
  serviceAccount: runai-reservation-engine
  serviceAccountName: runai-reservation-engine
  terminationGracePeriodSeconds: 30
  tolerations:
  - effect: NoExecute
    key: node.kubernetes.io/not-ready
    operator: Exists
    tolerationSeconds: 300
  - effect: NoExecute
    key: node.kubernetes.io/unreachable
    operator: Exists
    tolerationSeconds: 300
  volumes:
  - name: kube-api-access-fnjgk
    projected:
      defaultMode: 420
      sources:
      - serviceAccountToken:
          expirationSeconds: 3607
          path: token
      - configMap:
          items:
          - key: ca.crt
            path: ca.crt
          name: kube-root-ca.crt
      - downwardAPI:
          items:
          - fieldRef:
              apiVersion: v1
              fieldPath: metadata.namespace
            path: namespace
# Observed status of the reservation pod: phase Running, one container
# ("runai-reservation") started at 2024-03-31T09:03:27Z with zero restarts.
status:
  conditions:
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:25Z"
    status: "True"
    type: Initialized
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: Ready
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:27Z"
    status: "True"
    type: ContainersReady
  - lastProbeTime: null
    lastTransitionTime: "2024-03-31T09:03:25Z"
    status: "True"
    type: PodScheduled
  containerStatuses:
  - containerID: containerd://1063439dc8e82d20ef89a97ad9567d40d59d0d270ac5b8d4cab7f49a474e4398
    image: gcr.io/run-ai-prod/resource-reservation:v3.5.0
    imageID: gcr.io/run-ai-prod/resource-reservation@sha256:add1db641829508bbd1e74a7e757348159bc99b67844fc656acc1e795872d0a6
    lastState: {}
    name: runai-reservation
    ready: true
    restartCount: 0
    started: true
    state:
      running:
        startedAt: "2024-03-31T09:03:27Z"
  hostIP: 172.20.62.77
  phase: Running
  podIP: 100.122.249.151
  podIPs:
  - ip: 100.122.249.151
  qosClass: BestEffort
  startTime: "2024-03-31T09:03:25Z"

0 commit comments

Comments
 (0)