|
| 1 | +apiVersion: v1 |
| 2 | +kind: Pod |
| 3 | +metadata: |
| 4 | + annotations: |
| 5 | + clusterId: d69dff42-4134-41d9-90fc-1c39505cb774 |
| 6 | + cni.projectcalico.org/containerID: 4357ed00f685ddcfafb9b551fbad81f6ad9a39d74f0158fe22b6e937dad415df |
| 7 | + cni.projectcalico.org/podIP: 100.122.249.152/32 |
| 8 | + cni.projectcalico.org/podIPs: 100.122.249.152/32 |
| 9 | + gpu-fraction: "0.5" |
| 10 | + pod-group-name: pg-frac-1-0-2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 11 | + received-resource-type: Fraction |
| 12 | + runai-allocated-gpu-memory: "7680" |
| 13 | + runai-allocated-gpus: "0.5" |
| 14 | + runai-allocated-mig-gpus: "0" |
| 15 | + runai-calculated-status: Running |
| 16 | + runai-job-id: 2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 17 | + runai-node: i-0b498db53280b86a6 |
| 18 | + runai/shared-gpu-configmap: frac-1-ns26p7c-runai-sh-gpu |
| 19 | + |
| 20 | + workloadId: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 |
| 21 | + creationTimestamp: "2024-03-31T09:03:22Z" |
| 22 | + generateName: frac-1- |
| 23 | + labels: |
| 24 | + app: runaijob |
| 25 | + controller-uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 26 | + createdBy: RunaiJob |
| 27 | + project: pa |
| 28 | + release: frac-1 |
| 29 | + run.ai/top-owner-uid: 027397ab-4c3c-45f7-87d0-8b3bae4ded65 |
| 30 | + runai-gpu-group: df7c0dd3-9795-443c-85b9-acbf49c8fb6b |
| 31 | + runai/pod-index: 0-0 |
| 32 | + workloadKind: TrainingWorkload |
| 33 | + workloadName: frac-1 |
| 34 | + name: frac-1-0-0 |
| 35 | + namespace: runai-pa |
| 36 | + ownerReferences: |
| 37 | + - apiVersion: run.ai/v1 |
| 38 | + blockOwnerDeletion: true |
| 39 | + controller: true |
| 40 | + kind: RunaiJob |
| 41 | + name: frac-1 |
| 42 | + uid: 2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 43 | + resourceVersion: "10748" |
| 44 | + uid: a801b3c7-b9be-4830-821c-2456cad2234f |
| 45 | +spec: |
| 46 | + affinity: |
| 47 | + nodeAffinity: |
| 48 | + requiredDuringSchedulingIgnoredDuringExecution: |
| 49 | + nodeSelectorTerms: |
| 50 | + - matchExpressions: |
| 51 | + - key: runai/node-pool |
| 52 | + operator: DoesNotExist |
| 53 | + containers: |
| 54 | + - env: |
| 55 | + - name: RUNAI_JOB_NAME |
| 56 | + value: frac-1 |
| 57 | + - name: RUNAI_PROJECT |
| 58 | + value: pa |
| 59 | + - name: WANDB_NOTES |
| 60 | + value: https://shaibi-real.runailabs.com/trainings?columnFilter=[{"term":"frac-1","name":"name"}]&clusterId=d69dff42-4134-41d9-90fc-1c39505cb774 |
| 61 | + - name: POD_INDEX |
| 62 | + value: "0" |
| 63 | + - name: RUNAI_GPU_MEMORY_REQUEST |
| 64 | + value: "0.50" |
| 65 | + - name: RUNAI_GPU_MEMORY_LIMIT |
| 66 | + value: "0.50" |
| 67 | + - name: NVIDIA_VISIBLE_DEVICES |
| 68 | + valueFrom: |
| 69 | + configMapKeyRef: |
| 70 | + key: RUNAI-VISIBLE-DEVICES |
| 71 | + name: frac-1-ns26p7c-runai-sh-gpu-0 |
| 72 | + - name: RUNAI_NUM_OF_GPUS |
| 73 | + valueFrom: |
| 74 | + configMapKeyRef: |
| 75 | + key: RUNAI_NUM_OF_GPUS |
| 76 | + name: frac-1-ns26p7c-runai-sh-gpu-0 |
| 77 | + - name: jobUUID |
| 78 | + value: 2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 79 | + - name: JOB_UUID |
| 80 | + value: 2237ca39-cac0-4601-b658-8a3c5f406a4f |
| 81 | + - name: jobName |
| 82 | + value: frac-1 |
| 83 | + - name: JOB_NAME |
| 84 | + value: frac-1 |
| 85 | + - name: reporterGatewayURL |
| 86 | + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 |
| 87 | + - name: REPORTER_GATEWAY_URL |
| 88 | + value: runai-prometheus-pushgateway.runai.svc.cluster.local:9091 |
| 89 | + - name: podUUID |
| 90 | + valueFrom: |
| 91 | + fieldRef: |
| 92 | + apiVersion: v1 |
| 93 | + fieldPath: metadata.uid |
| 94 | + - name: POD_UUID |
| 95 | + valueFrom: |
| 96 | + fieldRef: |
| 97 | + apiVersion: v1 |
| 98 | + fieldPath: metadata.uid |
| 99 | + - name: NODE_NAME |
| 100 | + valueFrom: |
| 101 | + fieldRef: |
| 102 | + apiVersion: v1 |
| 103 | + fieldPath: spec.nodeName |
| 104 | + envFrom: |
| 105 | + - configMapRef: |
| 106 | + name: frac-1-ns26p7c-runai-sh-gpu-0-evar |
| 107 | + optional: false |
| 108 | + image: gshaibi/gpu-burn |
| 109 | + imagePullPolicy: IfNotPresent |
| 110 | + name: frac-1 |
| 111 | + resources: |
| 112 | + requests: |
| 113 | + cpu: 100m |
| 114 | + memory: 100M |
| 115 | + securityContext: |
| 116 | + allowPrivilegeEscalation: false |
| 117 | + capabilities: {} |
| 118 | + seccompProfile: |
| 119 | + type: RuntimeDefault |
| 120 | + terminationMessagePath: /dev/termination-log |
| 121 | + terminationMessagePolicy: File |
| 122 | + volumeMounts: |
| 123 | + - mountPath: /var/run/secrets/kubernetes.io/serviceaccount |
| 124 | + name: kube-api-access-dfphn |
| 125 | + readOnly: true |
| 126 | + - mountPath: /etc/ld.so.preload |
| 127 | + name: frac-1-ns26p7c-runai-sh-gpu-0-vol |
| 128 | + readOnly: true |
| 129 | + subPath: ld.so.preload-key |
| 130 | + - mountPath: /etc/runai.d/memory |
| 131 | + name: frac-1-ns26p7c-runai-sh-gpu-0-vol |
| 132 | + readOnly: true |
| 133 | + subPath: config |
| 134 | + - mountPath: /etc/runai.d/pod_uuid |
| 135 | + name: frac-1-ns26p7c-runai-sh-gpu-0-vol |
| 136 | + readOnly: true |
| 137 | + subPath: pod-uuid |
| 138 | + - mountPath: /runai/shared |
| 139 | + name: runai-shared-directory |
| 140 | + readOnly: true |
| 141 | + - mountPath: /etc/runai.d/route |
| 142 | + name: frac-1-ns26p7c-runai-sh-gpu-0-vol |
| 143 | + readOnly: true |
| 144 | + subPath: route |
| 145 | + dnsPolicy: ClusterFirst |
| 146 | + enableServiceLinks: true |
| 147 | + nodeName: i-0b498db53280b86a6 |
| 148 | + preemptionPolicy: PreemptLowerPriority |
| 149 | + priority: 0 |
| 150 | + restartPolicy: Never |
| 151 | + schedulerName: runai-scheduler |
| 152 | + securityContext: {} |
| 153 | + serviceAccount: default |
| 154 | + serviceAccountName: default |
| 155 | + terminationGracePeriodSeconds: 30 |
| 156 | + tolerations: |
| 157 | + - effect: NoExecute |
| 158 | + key: node.kubernetes.io/not-ready |
| 159 | + operator: Exists |
| 160 | + tolerationSeconds: 300 |
| 161 | + - effect: NoExecute |
| 162 | + key: node.kubernetes.io/unreachable |
| 163 | + operator: Exists |
| 164 | + tolerationSeconds: 300 |
| 165 | + volumes: |
| 166 | + - name: kube-api-access-dfphn |
| 167 | + projected: |
| 168 | + defaultMode: 420 |
| 169 | + sources: |
| 170 | + - serviceAccountToken: |
| 171 | + expirationSeconds: 3607 |
| 172 | + path: token |
| 173 | + - configMap: |
| 174 | + items: |
| 175 | + - key: ca.crt |
| 176 | + path: ca.crt |
| 177 | + name: kube-root-ca.crt |
| 178 | + - downwardAPI: |
| 179 | + items: |
| 180 | + - fieldRef: |
| 181 | + apiVersion: v1 |
| 182 | + fieldPath: metadata.namespace |
| 183 | + path: namespace |
| 184 | + - configMap: |
| 185 | + defaultMode: 420 |
| 186 | + name: frac-1-ns26p7c-runai-sh-gpu-0 |
| 187 | + name: frac-1-ns26p7c-runai-sh-gpu-0-vol |
| 188 | + - hostPath: |
| 189 | + path: /var/lib/runai/shared |
| 190 | + type: DirectoryOrCreate |
| 191 | + name: runai-shared-directory |
| 192 | +status: |
| 193 | + conditions: |
| 194 | + - lastProbeTime: null |
| 195 | + lastTransitionTime: "2024-03-31T09:03:27Z" |
| 196 | + status: "True" |
| 197 | + type: Initialized |
| 198 | + - lastProbeTime: null |
| 199 | + lastTransitionTime: "2024-03-31T09:03:51Z" |
| 200 | + status: "True" |
| 201 | + type: Ready |
| 202 | + - lastProbeTime: null |
| 203 | + lastTransitionTime: "2024-03-31T09:03:51Z" |
| 204 | + status: "True" |
| 205 | + type: ContainersReady |
| 206 | + - lastProbeTime: null |
| 207 | + lastTransitionTime: "2024-03-31T09:03:27Z" |
| 208 | + status: "True" |
| 209 | + type: PodScheduled |
| 210 | + containerStatuses: |
| 211 | + - containerID: containerd://4205608c75216bfe3d3a71ea7301f8bc041acba92673e033fc87be6d91867dc6 |
| 212 | + image: docker.io/gshaibi/gpu-burn:latest |
| 213 | + imageID: docker.io/gshaibi/gpu-burn@sha256:ed07993b0581228c2bd7113fae0ed214549547f0fa91ba50165bc2473cfaf979 |
| 214 | + lastState: {} |
| 215 | + name: frac-1 |
| 216 | + ready: true |
| 217 | + restartCount: 0 |
| 218 | + started: true |
| 219 | + state: |
| 220 | + running: |
| 221 | + startedAt: "2024-03-31T09:03:51Z" |
| 222 | + hostIP: 172.20.62.77 |
| 223 | + phase: Running |
| 224 | + podIP: 100.122.249.152 |
| 225 | + podIPs: |
| 226 | + - ip: 100.122.249.152 |
| 227 | + qosClass: Burstable |
| 228 | + startTime: "2024-03-31T09:03:27Z" |
0 commit comments