run-ai · Apr 2, 2024
diff --git a/‎cmd/device-plugin/main.go
+1-1 b/‎cmd/device-plugin/main.go
+1-1
diff --git a/‎cmd/status-updater/main.go
+4 b/‎cmd/status-updater/main.go
+4
diff --git a/‎deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
+77 b/‎deploy/fake-gpu-operator/templates/device-plugin/_helpers.tpl
+77
diff --git a/‎deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
+5-64 b/‎deploy/fake-gpu-operator/templates/device-plugin/daemonset.yml
+5-64
diff --git a/‎deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml
+16 b/‎deploy/fake-gpu-operator/templates/device-plugin/deployment-template.yaml
+16
diff --git a/‎deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
+61 b/‎deploy/fake-gpu-operator/templates/status-exporter/_helpers.tpl
+61
diff --git a/‎deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
+6-56 b/‎deploy/fake-gpu-operator/templates/status-exporter/daemonset.yaml
+6-56
diff --git a/‎deploy/fake-gpu-operator/templates/status-exporter/deployment-template.yaml
+16 b/‎deploy/fake-gpu-operator/templates/status-exporter/deployment-template.yaml
+16
diff --git a/‎deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml
+1 b/‎deploy/fake-gpu-operator/templates/status-updater/clusterrole.yaml
+1
diff --git a/‎deploy/fake-gpu-operator/templates/status-updater/deployment.yaml
+2 b/‎deploy/fake-gpu-operator/templates/status-updater/deployment.yaml
+2
diff --git a/‎deploy/fake-gpu-operator/templates/status-updater/role.yaml
+16 b/‎deploy/fake-gpu-operator/templates/status-updater/role.yaml
+16
diff --git a/‎deploy/fake-gpu-operator/templates/status-updater/rolebinding.yaml
+12 b/‎deploy/fake-gpu-operator/templates/status-updater/rolebinding.yaml
+12
diff --git a/‎go.mod
+34-35 b/‎go.mod
+34-35
diff --git a/‎go.sum
+82-82 b/‎go.sum
+82-82
diff --git a/‎internal/common/constants/constants.go
+12-3 b/‎internal/common/constants/constants.go
+12-3
diff --git a/‎internal/common/kubeclient/kubeclient.go
+4-1 b/‎internal/common/kubeclient/kubeclient.go
+4-1
diff --git a/‎internal/deviceplugin/device_plugin.go
+13-208 b/‎internal/deviceplugin/device_plugin.go
+13-208
diff --git a/‎internal/deviceplugin/fake_node.go
+28 b/‎internal/deviceplugin/fake_node.go
+28
diff --git a/‎internal/deviceplugin/real_node.go
+217 b/‎internal/deviceplugin/real_node.go
+217
diff --git a/‎internal/status-updater/handlers/node/fake_node_deployments.go
+115 b/‎internal/status-updater/handlers/node/fake_node_deployments.go
+115
diff --git a/‎internal/status-updater/handlers/node/handler.go
+8-29 b/‎internal/status-updater/handlers/node/handler.go
+8-29
diff --git a/‎internal/status-updater/handlers/node/topology_cm.go
+48 b/‎internal/status-updater/handlers/node/topology_cm.go
+48
@@ -41,7 +41,7 @@ func main() {
 	initNvidiaSmi()
 	initPreloaders()
 
-	devicePlugin := deviceplugin.NewDevicePlugin(topology)
+	devicePlugin := deviceplugin.NewDevicePlugin(topology, kubeClient)
 	if err = devicePlugin.Serve(); err != nil {
 		log.Printf("Failed to serve device plugin: %s\n", err)
 		os.Exit(1)
 
@@ -2,10 +2,21 @@ package main
 
 import (
 	"github.com/run-ai/fake-gpu-operator/internal/common/app"
+	"github.com/run-ai/fake-gpu-operator/internal/common/config"
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
 	status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"
 )
 
 func main() {
+	// Environment variables handed to config.ValidateConfig before the app
+	// starts; named via the shared constants rather than repeated literals.
+	requiredEnvVars := []string{
+		constants.EnvTopologyCmName,
+		constants.EnvTopologyCmNamespace,
+		constants.EnvFakeGpuOperatorNs,
+	}
+	config.ValidateConfig(requiredEnvVars)
+
 	appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})
 	appRunner.Run()
 }
@@ -0,0 +1,77 @@
+{{- define "fake-gpu-operator.device-plugin.common.metadata.labels" -}}
+app: device-plugin
+{{- end -}}
+
+{{- define "fake-gpu-operator.device-plugin.common.metadata.annotations" -}}
+openshift.io/scc: hostmount-anyuid
+{{- end -}}
+
+{{- define "fake-gpu-operator.device-plugin.common.metadata.name" -}}
+device-plugin
+{{- end -}}
+
+{{- define "fake-gpu-operator.device-plugin.common.podSelector" }}
+matchLabels:
+  app: device-plugin
+  component: device-plugin
+{{- end }}
+
+{{- define "fake-gpu-operator.device-plugin.common.podTemplate.metadata" }}
+annotations:
+  checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
+labels:
+  app: device-plugin
+  component: device-plugin
+{{- end }}
+
+{{- define "fake-gpu-operator.device-plugin.common.podTemplate.spec" }}
+containers:
+  - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
+    imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
+    resources:
+      {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
+    env:
+      - name: NODE_NAME
+        valueFrom:
+          fieldRef:
+            fieldPath: spec.nodeName
+      - name: TOPOLOGY_CM_NAME
+        value: topology
+      - name: TOPOLOGY_CM_NAMESPACE
+        value: "{{ .Release.Namespace }}"
+    name: nvidia-device-plugin-ctr
+    securityContext:
+      privileged: true
+    terminationMessagePath: /dev/termination-log
+    terminationMessagePolicy: File
+    volumeMounts:
+      - mountPath: /runai/bin
+        name: runai-bin-directory
+      - mountPath: /runai/shared
+        name: runai-shared-directory              
+      - mountPath: /var/lib/kubelet/device-plugins
+        name: device-plugin
+dnsPolicy: ClusterFirst
+restartPolicy: Always
+serviceAccountName: nvidia-device-plugin
+terminationGracePeriodSeconds: 30
+tolerations:
+  - effect: NoSchedule
+    key: nvidia.com/gpu
+    operator: Exists
+imagePullSecrets:
+  - name: gcr-secret
+volumes:
+  - hostPath:
+      path: /var/lib/kubelet/device-plugins
+      type: ""
+    name: device-plugin
+  - hostPath:
+      path: /var/lib/runai/bin
+      type: DirectoryOrCreate
+    name: runai-bin-directory
+  - hostPath:
+      path: /var/lib/runai/shared
+      type: DirectoryOrCreate
+    name: runai-shared-directory
+{{- end }}
@@ -1,75 +1,20 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
-{{- if .Values.environment.openshift }}
-  annotations:
-    openshift.io/scc: hostmount-anyuid
-{{- end }}
+  name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
   labels:
-    app: device-plugin
-  name: device-plugin
+    {{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
+{{- if .Values.environment.openshift }}
+  annotations:
+    {{- include "fake-gpu-operator.device-plugin.common.metadata.annotations" . | nindent 4 }}
+{{- end }}
 spec:
   selector:
-    matchLabels:
-      app: device-plugin
-      component: device-plugin
+    {{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
   template:
     metadata:
-      annotations:
-        checksum/initialTopology: {{ include (print $.Template.BasePath "/topology-cm.yml") . | sha256sum }}
-      labels:
-        app: device-plugin
-        component: device-plugin
+      {{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
     spec:
-      containers:
-        - image: "{{ .Values.devicePlugin.image.repository }}:{{ .Values.devicePlugin.image.tag }}"
-          imagePullPolicy: "{{ .Values.devicePlugin.image.pullPolicy }}"
-          resources:
-            {{- toYaml .Values.devicePlugin.resources | nindent 12 }}
-          env:
-            - name: NODE_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: TOPOLOGY_CM_NAME
-              value: topology
-            - name: TOPOLOGY_CM_NAMESPACE
-              value: "{{ .Release.Namespace }}"
-          imagePullPolicy: Always
-          name: nvidia-device-plugin-ctr
-          securityContext:
-            privileged: true
-          terminationMessagePath: /dev/termination-log
-          terminationMessagePolicy: File
-          volumeMounts:
-            - mountPath: /runai/bin
-              name: runai-bin-directory
-            - mountPath: /runai/shared
-              name: runai-shared-directory              
-            - mountPath: /var/lib/kubelet/device-plugins
-              name: device-plugin
-      dnsPolicy: ClusterFirst
+      {{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
       nodeSelector:
         nvidia.com/gpu.deploy.device-plugin: "true"
-      restartPolicy: Always
-      serviceAccountName: nvidia-device-plugin
-      terminationGracePeriodSeconds: 30
-      tolerations:
-        - effect: NoSchedule
-          key: nvidia.com/gpu
-          operator: Exists
-      imagePullSecrets:
-        - name: gcr-secret
-      volumes:
-        - hostPath:
-            path: /var/lib/kubelet/device-plugins
-            type: ""
-          name: device-plugin
-        - hostPath:
-            path: /var/lib/runai/bin
-            type: DirectoryOrCreate
-          name: runai-bin-directory
-        - hostPath:
-            path: /var/lib/runai/shared
-            type: DirectoryOrCreate
-          name: runai-shared-directory
@@ -0,0 +1,16 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "fake-gpu-operator.device-plugin.common.metadata.name" . }}
+  labels:
+    {{- include "fake-gpu-operator.device-plugin.common.metadata.labels" . | nindent 4 }}
+    run.ai/fake-node-deployment-template: "true"
+spec:
+  replicas: 0
+  selector:
+    {{- include "fake-gpu-operator.device-plugin.common.podSelector" . | nindent 4 }}
+  template:
+    metadata:
+      {{- include "fake-gpu-operator.device-plugin.common.podTemplate.metadata" . | nindent 6 }}
+    spec:
+      {{- include "fake-gpu-operator.device-plugin.common.podTemplate.spec" . | nindent 6 }}
@@ -0,0 +1,61 @@
+{{- define "fake-gpu-operator.status-exporter.common.metadata.labels" -}}
+app: nvidia-dcgm-exporter
+component: status-exporter
+app.kubernetes.io/name: nvidia-container-toolkit
+{{- end -}}
+
+{{- define "fake-gpu-operator.status-exporter.common.metadata.name" -}}
+nvidia-dcgm-exporter
+{{- end -}}
+
+{{- define "fake-gpu-operator.status-exporter.common.podSelector" -}}
+matchLabels:
+  app: nvidia-dcgm-exporter
+{{- end -}}
+
+{{- define "fake-gpu-operator.status-exporter.common.podTemplate.metadata" -}}
+labels:
+  app: nvidia-dcgm-exporter
+  app.kubernetes.io/name: nvidia-container-toolkit
+{{- end -}}
+
+{{- define "fake-gpu-operator.status-exporter.common.podTemplate.spec" -}}
+containers:
+- image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
+  imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
+  resources:
+    {{- toYaml .Values.statusExporter.resources | nindent 8 }}
+  name: nvidia-dcgm-exporter
+  env:
+    - name: NODE_NAME
+      valueFrom:
+        fieldRef:
+          fieldPath: spec.nodeName
+    - name: TOPOLOGY_CM_NAME
+      value: topology
+    - name: TOPOLOGY_CM_NAMESPACE
+      value: "{{ .Release.Namespace }}"
+    - name: TOPOLOGY_MAX_EXPORT_INTERVAL
+      value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
+  ports:
+    - containerPort: 9400
+      name: http
+  volumeMounts:
+    - mountPath: /runai/proc
+      name: runai-proc-directory
+restartPolicy: Always
+schedulerName: default-scheduler
+serviceAccount: status-exporter
+serviceAccountName: status-exporter
+tolerations:
+  - effect: NoSchedule
+    key: nvidia.com/gpu
+    operator: Exists
+imagePullSecrets:
+  - name: gcr-secret
+volumes:
+  - name: runai-proc-directory
+    hostPath:
+      path: /var/lib/runai/proc
+      type: DirectoryOrCreate
+{{- end -}}
@@ -1,66 +1,16 @@
 apiVersion: apps/v1
 kind: DaemonSet
 metadata:
+  name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
   labels:
-    app: nvidia-dcgm-exporter
-    component: status-exporter
-    # this label would make the deployment pod to mimic the container-toolkit, on top of mimicking the dcgm-exporter.
-    app.kubernetes.io/name: nvidia-container-toolkit
-  name: nvidia-dcgm-exporter
-
+    {{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
 spec:
   selector:
-    matchLabels:
-      app: nvidia-dcgm-exporter
+    {{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
   template:
     metadata:
-      creationTimestamp: null
-      labels:
-        app: nvidia-dcgm-exporter
-        app.kubernetes.io/name: nvidia-container-toolkit
+      {{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
     spec:
-      containers:
-        - image: "{{ .Values.statusExporter.image.repository }}:{{ .Values.statusExporter.image.tag }}"
-          imagePullPolicy: "{{ .Values.statusExporter.image.pullPolicy }}"
-          resources:
-            {{- toYaml .Values.statusExporter.resources | nindent 12 }}
-          name: nvidia-dcgm-exporter
-          env:
-            - name: NODE_NAME
-              valueFrom:
-                fieldRef:
-                  fieldPath: spec.nodeName
-            - name: TOPOLOGY_CM_NAME
-              value: topology
-            - name: TOPOLOGY_CM_NAMESPACE
-              value: "{{ .Release.Namespace }}"
-            - name: TOPOLOGY_MAX_EXPORT_INTERVAL
-              value: "{{ .Values.statusExporter.topologyMaxExportInterval }}"
-          ports:
-            - containerPort: 9400
-              name: http
-          volumeMounts:
-            - mountPath: /runai/proc
-              name: runai-proc-directory
+      {{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
       nodeSelector:
-        nvidia.com/gpu.deploy.dcgm-exporter: "true"
-      restartPolicy: Always
-      schedulerName: default-scheduler
-      serviceAccount: status-exporter
-      serviceAccountName: status-exporter
-      tolerations:
-        - effect: NoSchedule
-          key: nvidia.com/gpu
-          operator: Exists
-      imagePullSecrets:
-        - name: gcr-secret
-      volumes:
-        - name: runai-proc-directory
-          hostPath:
-            path: /var/lib/runai/proc
-            type: DirectoryOrCreate
-  updateStrategy:
-    rollingUpdate:
-      maxSurge: 0
-      maxUnavailable: 1
-    type: RollingUpdate
+        nvidia.com/gpu.deploy.dcgm-exporter: "true"
@@ -0,0 +1,16 @@
+apiVersion: apps/v1
+kind: Deployment
+metadata:
+  name: {{ include "fake-gpu-operator.status-exporter.common.metadata.name" . }}
+  labels:
+    {{- include "fake-gpu-operator.status-exporter.common.metadata.labels" . | nindent 4 }}
+    run.ai/fake-node-deployment-template: "true"
+spec:
+  replicas: 0
+  selector:
+    {{- include "fake-gpu-operator.status-exporter.common.podSelector" . | nindent 4 }}
+  template:
+    metadata:
+      {{- include "fake-gpu-operator.status-exporter.common.podTemplate.metadata" . | nindent 6 }}
+    spec:
+      {{- include "fake-gpu-operator.status-exporter.common.podTemplate.spec" . | nindent 6 }}
@@ -12,6 +12,7 @@ rules:
       - get
       - list
       - watch
+      - patch
   - apiGroups:
       - ""
     resources:
 
@@ -29,6 +29,8 @@ spec:
               value: topology
             - name: TOPOLOGY_CM_NAMESPACE
               value: "{{ .Release.Namespace }}"
+            - name: FAKE_GPU_OPERATOR_NAMESPACE
+              value: "{{ .Release.Namespace }}"
       restartPolicy: Always
       serviceAccountName: status-updater
       imagePullSecrets:
 
@@ -0,0 +1,16 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: Role
+metadata:
+  name: fake-status-updater
+rules:
+  - apiGroups:
+      - apps
+    resources:
+      - deployments
+    verbs:
+      - update
+      - list
+      - get
+      - watch
+      - create
+      - delete
@@ -0,0 +1,12 @@
+apiVersion: rbac.authorization.k8s.io/v1
+kind: RoleBinding
+metadata:
+  name: fake-status-updater
+roleRef:
+  kind: Role
+  apiGroup: rbac.authorization.k8s.io
+  name: fake-status-updater
+subjects:
+  - kind: ServiceAccount
+    name: status-updater
+    namespace: "{{ .Release.Namespace }}"
@@ -10,96 +10,95 @@ require (
 	github.com/onsi/ginkgo/v2 v2.17.1
 	github.com/onsi/gomega v1.30.0
 	github.com/otiai10/copy v1.7.0
-	github.com/prometheus/client_golang v1.14.0
-	github.com/prometheus/client_model v0.3.0
+	github.com/prometheus/client_golang v1.18.0
+	github.com/prometheus/client_model v0.5.0
 	github.com/spf13/viper v1.14.0
 	github.com/tidwall/gjson v1.14.1
 	golang.org/x/net v0.20.0
-	google.golang.org/grpc v1.56.3
+	google.golang.org/grpc v1.58.3
 	gopkg.in/yaml.v3 v3.0.1
-	k8s.io/api v0.26.0
-	k8s.io/apimachinery v0.26.0
-	k8s.io/client-go v0.26.0
+	k8s.io/api v0.29.3
+	k8s.io/apimachinery v0.29.3
+	k8s.io/client-go v0.29.3
 	k8s.io/kubelet v0.24.0
-	sigs.k8s.io/controller-runtime v0.14.1
+	sigs.k8s.io/controller-runtime v0.17.2
 )
 
 require (
-	github.com/emicklei/go-restful/v3 v3.9.0 // indirect
-	github.com/evanphx/json-patch/v5 v5.6.0 // indirect
+	github.com/emicklei/go-restful/v3 v3.11.0 // indirect
+	github.com/evanphx/json-patch/v5 v5.8.0 // indirect
 	github.com/go-playground/locales v0.14.0 // indirect
 	github.com/go-playground/universal-translator v0.18.0 // indirect
 	github.com/go-task/slim-sprig v0.0.0-20230315185526-52ccab3ef572 // indirect
 	github.com/golang/groupcache v0.0.0-20210331224755-41bb18bfe9da // indirect
-	github.com/google/pprof v0.0.0-20210407192527-94a9f03dee38 // indirect
+	github.com/google/gnostic-models v0.6.8 // indirect
+	github.com/google/pprof v0.0.0-20210720184732-4bb14d4b1be1 // indirect
 	github.com/hashicorp/errwrap v1.0.0 // indirect
 	github.com/hashicorp/hcl v1.0.0 // indirect
 	github.com/imdario/mergo v0.3.6 // indirect
 	github.com/leodido/go-urn v1.2.1 // indirect
 	github.com/magiconair/properties v1.8.6 // indirect
+	github.com/matttproud/golang_protobuf_extensions/v2 v2.0.0 // indirect
 	github.com/pelletier/go-toml v1.9.5 // indirect
 	github.com/pelletier/go-toml/v2 v2.0.5 // indirect
 	github.com/pmezard/go-difflib v1.0.0 // indirect
-	github.com/rogpeppe/go-internal v1.8.0 // indirect
 	github.com/spf13/afero v1.9.2 // indirect
 	github.com/spf13/cast v1.5.0 // indirect
 	github.com/spf13/jwalterweatherman v1.1.0 // indirect
 	github.com/spf13/pflag v1.0.5 // indirect
 	github.com/subosito/gotenv v1.4.1 // indirect
+	golang.org/x/exp v0.0.0-20220722155223-a9213eeb770e // indirect
 	golang.org/x/tools v0.17.0 // indirect
-	gomodules.xyz/jsonpatch/v2 v2.2.0 // indirect
-	gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
+	gomodules.xyz/jsonpatch/v2 v2.4.0 // indirect
+	google.golang.org/genproto/googleapis/rpc v0.0.0-20230822172742-b8732ec3820d // indirect
 	gopkg.in/go-playground/assert.v1 v1.2.1 // indirect
 	gopkg.in/ini.v1 v1.67.0 // indirect
 	gopkg.in/yaml.v2 v2.4.0 // indirect
-	k8s.io/apiextensions-apiserver v0.26.0 // indirect
-	k8s.io/component-base v0.26.0 // indirect
+	k8s.io/apiextensions-apiserver v0.29.0 // indirect
+	k8s.io/component-base v0.29.0 // indirect
 )
 
 require (
 	github.com/beorn7/perks v1.0.1 // indirect
 	github.com/cespare/xxhash/v2 v2.2.0 // indirect
 	github.com/davecgh/go-spew v1.1.1 // indirect
 	github.com/evanphx/json-patch v4.12.0+incompatible // indirect
-	github.com/fsnotify/fsnotify v1.6.0 // indirect
+	github.com/fsnotify/fsnotify v1.7.0 // indirect
 	github.com/go-logr/logr v1.4.1 // indirect
-	github.com/go-openapi/jsonpointer v0.19.5 // indirect
-	github.com/go-openapi/jsonreference v0.20.0 // indirect
-	github.com/go-openapi/swag v0.19.14 // indirect
+	github.com/go-openapi/jsonpointer v0.19.6 // indirect
+	github.com/go-openapi/jsonreference v0.20.2 // indirect
+	github.com/go-openapi/swag v0.22.3 // indirect
 	github.com/go-playground/validator v9.31.0+incompatible
 	github.com/gogo/protobuf v1.3.2 // indirect
-	github.com/golang/protobuf v1.5.3 // indirect
-	github.com/google/gnostic v0.5.7-v3refs // indirect
+	github.com/golang/protobuf v1.5.4 // indirect
 	github.com/google/go-cmp v0.6.0 // indirect
-	github.com/google/gofuzz v1.1.0 // indirect
+	github.com/google/gofuzz v1.2.0 // indirect
 	github.com/josharian/intern v1.0.0 // indirect
 	github.com/json-iterator/go v1.1.12 // indirect
-	github.com/mailru/easyjson v0.7.6 // indirect
+	github.com/mailru/easyjson v0.7.7 // indirect
 	github.com/mattn/go-runewidth v0.0.13 // indirect
-	github.com/matttproud/golang_protobuf_extensions v1.0.2 // indirect
 	github.com/modern-go/concurrent v0.0.0-20180306012644-bacd9c7ef1dd // indirect
 	github.com/modern-go/reflect2 v1.0.2 // indirect
 	github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect
 	github.com/pkg/errors v0.9.1 // indirect
-	github.com/prometheus/common v0.37.0 // indirect
-	github.com/prometheus/procfs v0.8.0 // indirect
+	github.com/prometheus/common v0.45.0 // indirect
+	github.com/prometheus/procfs v0.12.0 // indirect
 	github.com/rivo/uniseg v0.2.0 // indirect
-	github.com/stretchr/testify v1.8.1
+	github.com/stretchr/testify v1.8.4
 	github.com/tidwall/match v1.1.1 // indirect
 	github.com/tidwall/pretty v1.2.0 // indirect
-	golang.org/x/oauth2 v0.7.0 // indirect
+	golang.org/x/oauth2 v0.12.0 // indirect
 	golang.org/x/sys v0.16.0 // indirect
 	golang.org/x/term v0.16.0 // indirect
 	golang.org/x/text v0.14.0 // indirect
 	golang.org/x/time v0.3.0 // indirect
 	google.golang.org/appengine v1.6.7 // indirect
-	google.golang.org/genproto v0.0.0-20230410155749-daa745c078e1 // indirect
-	google.golang.org/protobuf v1.30.0 // indirect
+	google.golang.org/protobuf v1.33.0 // indirect
 	gopkg.in/inf.v0 v0.9.1 // indirect
-	k8s.io/klog/v2 v2.80.1 // indirect
-	k8s.io/kube-openapi v0.0.0-20221012153701-172d655c2280 // indirect
+	k8s.io/klog/v2 v2.110.1 // indirect
+	k8s.io/kube-openapi v0.0.0-20231010175941-2dd684a91f00 // indirect
 	k8s.io/utils v0.0.0-20240310230437-4693a0247e57
-	sigs.k8s.io/json v0.0.0-20220713155537-f223a00ba0e2 // indirect
-	sigs.k8s.io/structured-merge-diff/v4 v4.2.3 // indirect
-	sigs.k8s.io/yaml v1.3.0 // indirect
+	sigs.k8s.io/json v0.0.0-20221116044647-bc3834ca7abd // indirect
+	sigs.k8s.io/structured-merge-diff/v4 v4.4.1 // indirect
+	sigs.k8s.io/yaml v1.4.0 // indirect
 )
@@ -6,12 +6,21 @@ const (
 	PodGroupNameAnnotation         = "pod-group-name"
 	ReservationPodGpuIdxAnnotation = "run.ai/reserve_for_gpu_index"
 	MigMappingAnnotation           = "run.ai/mig-mapping"
+	KwokNodeAnnotation             = "kwok.x-k8s.io/node"
 
-	GpuGroupLabel       = "runai-gpu-group"
-	GpuProductLabel     = "nvidia.com/gpu.product"
-	MigConfigStateLabel = "nvidia.com/mig.config.state"
+	GpuGroupLabel                   = "runai-gpu-group"
+	GpuProductLabel                 = "nvidia.com/gpu.product"
+	MigConfigStateLabel             = "nvidia.com/mig.config.state"
+	FakeNodeDeploymentTemplateLabel = "run.ai/fake-node-deployment-template"
 
 	ReservationNs = "runai-reservation"
 
 	GpuResourceName = "nvidia.com/gpu"
+
+	// TODO(Guy): use these constants in the code instead of repeating the literal env-var names
+	EnvFakeNode            = "FAKE_NODE"
+	EnvNodeName            = "NODE_NAME"
+	EnvTopologyCmName      = "TOPOLOGY_CM_NAME"
+	EnvTopologyCmNamespace = "TOPOLOGY_CM_NAMESPACE"
+	EnvFakeGpuOperatorNs   = "FAKE_GPU_OPERATOR_NAMESPACE"
 )
@@ -10,6 +10,8 @@ import (
 	"k8s.io/apimachinery/pkg/watch"
 	"k8s.io/client-go/kubernetes"
 	"k8s.io/client-go/rest"
+
+	ctrl "sigs.k8s.io/controller-runtime"
 )
 
 type KubeClientInterface interface {
@@ -28,11 +30,12 @@ type KubeClient struct {
 func NewKubeClient(config *rest.Config, stop chan struct{}) *KubeClient {
 	if config == nil {
 		var err error
-		config, err = rest.InClusterConfig()
+		config, err = ctrl.GetConfig()
 		if err != nil {
 			log.Fatalf("Error getting in cluster config to init kubeclient: %e", err)
 		}
 	}
+
 	clientset := kubernetes.NewForConfigOrDie(config)
 	return &KubeClient{
 		ClientSet: clientset,
 
@@ -1,229 +1,34 @@
 package deviceplugin
 
 import (
-	"fmt"
-	"log"
-	"net"
-	"os"
-	"path"
-	"strings"
-	"time"
-
-	"github.com/google/uuid"
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
 	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
-	"golang.org/x/net/context"
-	"google.golang.org/grpc"
-	"google.golang.org/grpc/credentials/insecure"
-	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+	"github.com/spf13/viper"
+	"k8s.io/client-go/kubernetes"
 )
 
 const (
 	resourceName = "nvidia.com/gpu"
-	serverSock   = pluginapi.DevicePluginPath + "fake-nvidia-gpu.sock"
 )
 
-type DevicePlugin struct {
-	devs   []*pluginapi.Device
-	socket string
-
-	stop   chan interface{}
-	health chan *pluginapi.Device
-	server *grpc.Server
+type Interface interface {
+	Serve() error
 }
 
-func NewDevicePlugin(topology *topology.NodeTopology) *DevicePlugin {
+func NewDevicePlugin(topology *topology.NodeTopology, kubeClient kubernetes.Interface) Interface {
 	if topology == nil {
 		panic("topology is nil")
 	}
 
-	return &DevicePlugin{
-		devs:   createDevices(getGpuCount(topology)),
-		socket: serverSock,
-	}
-}
-
-func getGpuCount(nodeTopology *topology.NodeTopology) int {
-	return len(nodeTopology.Gpus)
-}
-
-func (m *DevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
-	return &pluginapi.DevicePluginOptions{}, nil
-}
-
-func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
-	ctx := context.Background()
-	ctx, cancel := context.WithTimeout(ctx, timeout)
-	defer cancel()
-
-	c, err := grpc.DialContext(
-		ctx,
-		unixSocketPath,
-		grpc.WithTransportCredentials(insecure.NewCredentials()),
-		grpc.WithBlock(),
-		grpc.WithContextDialer(func(_ context.Context, addr string) (net.Conn, error) {
-			return net.DialTimeout("unix", addr, timeout)
-		}),
-	)
-
-	if err != nil {
-		return nil, err
-	}
-
-	return c, nil
-}
-
-func createDevices(devCount int) []*pluginapi.Device {
-	var devs []*pluginapi.Device
-	for i := 0; i < devCount; i++ {
-		u, _ := uuid.NewRandom()
-		devs = append(devs, &pluginapi.Device{
-			ID:     u.String(),
-			Health: pluginapi.Healthy,
-		})
-	}
-	return devs
-}
-
-func (m *DevicePlugin) Start() error {
-	err := m.cleanup()
-	if err != nil {
-		return err
-	}
-
-	sock, err := net.Listen("unix", m.socket)
-	if err != nil {
-		return err
-	}
-
-	m.server = grpc.NewServer([]grpc.ServerOption{}...)
-	pluginapi.RegisterDevicePluginServer(m.server, m)
-
-	go func() {
-		err := m.server.Serve(sock)
-		if err != nil {
-			log.Println(err)
-		}
-	}()
-
-	// Wait for server to start by launching a blocking connexion
-	conn, err := dial(m.socket, 5*time.Second)
-	if err != nil {
-		return err
-	}
-	conn.Close()
-
-	return nil
-}
-
-func (m *DevicePlugin) Stop() error {
-	if m.server == nil {
-		return nil
-	}
-
-	m.server.Stop()
-	m.server = nil
-	close(m.stop)
-
-	return m.cleanup()
-}
-
-func (m *DevicePlugin) Register(kubeletEndpoint, resourceName string) error {
-	conn, err := dial(kubeletEndpoint, 5*time.Second)
-	if err != nil {
-		return err
-	}
-	defer conn.Close()
-
-	client := pluginapi.NewRegistrationClient(conn)
-	reqt := &pluginapi.RegisterRequest{
-		Version:      pluginapi.Version,
-		Endpoint:     path.Base(m.socket),
-		ResourceName: resourceName,
-	}
-
-	_, err = client.Register(context.Background(), reqt)
-	if err != nil {
-		return err
-	}
-	return nil
-}
-
-func (m *DevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
-	err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
-	if err != nil {
-		fmt.Printf("Failed to send devices to Kubelet: %v\n", err)
-	}
-
-	for {
-		select {
-		case <-m.stop:
-			return nil
-		case d := <-m.health:
-			// FIXME: there is no way to recover from the Unhealthy state.
-			d.Health = pluginapi.Unhealthy
-			err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs})
-			if err != nil {
-				log.Printf("failed to send unhealthy update: %v", err)
-			}
+	if viper.GetBool(constants.EnvFakeNode) {
+		return &FakeNodeDevicePlugin{
+			kubeClient: kubeClient,
+			gpuCount:   getGpuCount(topology),
 		}
 	}
-}
-
-func (m *DevicePlugin) GetPreferredAllocation(context.Context, *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
-	return &pluginapi.PreferredAllocationResponse{}, nil
-}
-
-func (m *DevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
-	responses := pluginapi.AllocateResponse{}
-	for _, req := range reqs.ContainerRequests {
-		response := pluginapi.ContainerAllocateResponse{
-			Envs: map[string]string{
-				"MOCK_NVIDIA_VISIBLE_DEVICES": strings.Join(req.DevicesIDs, ","),
-			},
-			Mounts: []*pluginapi.Mount{
-				{
-					ContainerPath: "/bin/nvidia-smi",
-					HostPath:      "/var/lib/runai/bin/nvidia-smi",
-				},
-			},
-		}
 
-		responses.ContainerResponses = append(responses.ContainerResponses, &response)
-	}
-
-	return &responses, nil
-}
-
-func (m *DevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
-	return &pluginapi.PreStartContainerResponse{}, nil
-}
-
-func (m *DevicePlugin) cleanup() error {
-	if err := os.Remove(m.socket); err != nil && !os.IsNotExist(err) {
-		return err
-	}
-
-	return nil
-}
-
-func (m *DevicePlugin) Serve() error {
-	err := m.Start()
-	if err != nil {
-		log.Printf("Could not start device plugin: %s", err)
-		return err
-	}
-	log.Println("Starting to serve on", m.socket)
-
-	err = m.Register(pluginapi.KubeletSocket, resourceName)
-	if err != nil {
-		log.Printf("Could not register device plugin: %s", err)
-		stopErr := m.Stop()
-		if stopErr != nil {
-			log.Printf("Could not stop device plugin: %s", stopErr)
-		}
-		return err
+	return &RealNodeDevicePlugin{
+		devs:   createDevices(getGpuCount(topology)),
+		socket: serverSock,
 	}
-	log.Println("Registered device plugin with Kubelet")
-
-	return nil
 }
@@ -0,0 +1,42 @@
+package deviceplugin
+
+import (
+	"fmt"
+	"os"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	"golang.org/x/net/context"
+	"k8s.io/apimachinery/pkg/types"
+	"k8s.io/client-go/kubernetes"
+
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+// FakeNodeDevicePlugin is the device plugin NewDevicePlugin returns when the
+// FAKE_NODE setting is enabled. It advertises GPUs by patching the node's
+// status through the Kubernetes API.
+type FakeNodeDevicePlugin struct {
+	// kubeClient is the client used to patch the node object.
+	kubeClient kubernetes.Interface
+	// gpuCount is the number of GPUs written into the node status.
+	gpuCount int
+}
+
+// Serve patches the status subresource of the node named by the NODE_NAME
+// environment variable so that capacity and allocatable both report gpuCount
+// units of the nvidia.com/gpu resource. It issues a single merge patch and
+// returns; an API failure is wrapped with %w so the cause stays inspectable.
+func (f *FakeNodeDevicePlugin) Serve() error {
+	nodeName := os.Getenv(constants.EnvNodeName)
+	if nodeName == "" {
+		return fmt.Errorf("%s environment variable is not set", constants.EnvNodeName)
+	}
+
+	patch := fmt.Sprintf(`{"status": {"capacity": {"%s": "%d"}, "allocatable": {"%s": "%d"}}}`, resourceName, f.gpuCount, resourceName, f.gpuCount)
+	_, err := f.kubeClient.CoreV1().Nodes().Patch(context.TODO(), nodeName, types.MergePatchType, []byte(patch), metav1.PatchOptions{}, "status")
+	if err != nil {
+		return fmt.Errorf("failed to update node capacity and allocatable: %w", err)
+	}
+
+	return nil
+}
@@ -0,0 +1,217 @@
+package deviceplugin
+
+import (
+	"fmt"
+	"log"
+	"net"
+	"os"
+	"path"
+	"strings"
+	"time"
+
+	"github.com/google/uuid"
+	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
+	"golang.org/x/net/context"
+	"google.golang.org/grpc"
+	"google.golang.org/grpc/credentials/insecure"
+	pluginapi "k8s.io/kubelet/pkg/apis/deviceplugin/v1beta1"
+)
+
+const (
+	// serverSock is the unix socket path, located under
+	// pluginapi.DevicePluginPath, that the plugin's gRPC server listens on.
+	serverSock = pluginapi.DevicePluginPath + "fake-nvidia-gpu.sock"
+)
+
+// RealNodeDevicePlugin is a device plugin served over gRPC on a unix socket
+// and registered with the kubelet.
+//
+// devs is the device list sent by ListAndWatch, and socket is the unix socket
+// path that Start listens on and cleanup removes.
+//
+// stop is closed by Stop and makes ListAndWatch return; health delivers a
+// device that ListAndWatch then marks Unhealthy; server is the gRPC server
+// created by Start and reset to nil by Stop.
+// NOTE(review): nothing in this file sends on health — confirm whether a
+// health checker exists elsewhere.
+type RealNodeDevicePlugin struct {
+	devs   []*pluginapi.Device
+	socket string
+
+	stop   chan interface{}
+	health chan *pluginapi.Device
+	server *grpc.Server
+}
+
+// getGpuCount reports how many GPU entries the given node topology lists.
+func getGpuCount(nodeTopology *topology.NodeTopology) int {
+	gpus := nodeTopology.Gpus
+	return len(gpus)
+}
+
+// GetDevicePluginOptions returns an empty options message: every option is
+// left at its zero value.
+func (m *RealNodeDevicePlugin) GetDevicePluginOptions(context.Context, *pluginapi.Empty) (*pluginapi.DevicePluginOptions, error) {
+	options := new(pluginapi.DevicePluginOptions)
+	return options, nil
+}
+
+// dial opens a blocking gRPC client connection, without transport security,
+// to the unix socket at unixSocketPath.
+//
+// The whole attempt is bounded by timeout; when the connection cannot be
+// established in time the dial error is returned together with a nil
+// connection.
+func dial(unixSocketPath string, timeout time.Duration) (*grpc.ClientConn, error) {
+	ctx, cancel := context.WithTimeout(context.Background(), timeout)
+	defer cancel()
+
+	// Timeout bounds a single socket connect exactly as net.DialTimeout did,
+	// while DialContext additionally aborts the connect as soon as gRPC
+	// cancels the context it hands to the dialer.
+	dialer := net.Dialer{Timeout: timeout}
+
+	conn, err := grpc.DialContext(
+		ctx,
+		unixSocketPath,
+		grpc.WithTransportCredentials(insecure.NewCredentials()),
+		grpc.WithBlock(),
+		grpc.WithContextDialer(func(dialCtx context.Context, addr string) (net.Conn, error) {
+			return dialer.DialContext(dialCtx, "unix", addr)
+		}),
+	)
+	if err != nil {
+		return nil, err
+	}
+
+	return conn, nil
+}
+
+// createDevices builds devCount devices, each identified by a freshly
+// generated random UUID and reported as Healthy. The result is nil when
+// devCount is not positive.
+func createDevices(devCount int) []*pluginapi.Device {
+	var devices []*pluginapi.Device
+	for remaining := devCount; remaining > 0; remaining-- {
+		// The generation error is discarded; on failure the zero-value UUID
+		// is used as the device ID.
+		id, _ := uuid.NewRandom()
+		device := &pluginapi.Device{
+			ID:     id.String(),
+			Health: pluginapi.Healthy,
+		}
+		devices = append(devices, device)
+	}
+	return devices
+}
+
+// Start removes any stale socket file, listens on m.socket, registers the
+// plugin on a new gRPC server and serves it from a background goroutine.
+//
+// It returns nil once a blocking test connection to the socket has succeeded.
+// If removing the socket, listening or connecting fails, the error is
+// returned; in the last case the half-started server is stopped first.
+func (m *RealNodeDevicePlugin) Start() error {
+	if err := m.cleanup(); err != nil {
+		return err
+	}
+
+	sock, err := net.Listen("unix", m.socket)
+	if err != nil {
+		return err
+	}
+
+	// Stop closes m.stop and ListAndWatch selects on both channels, so they
+	// must exist before the server can accept calls: closing a nil channel
+	// panics and receiving from one blocks forever. A fresh stop channel is
+	// created on every start because an earlier Stop may have closed it.
+	m.stop = make(chan interface{})
+	if m.health == nil {
+		m.health = make(chan *pluginapi.Device)
+	}
+
+	server := grpc.NewServer()
+	pluginapi.RegisterDevicePluginServer(server, m)
+	m.server = server
+
+	go func() {
+		// Use the local variable: m.server is reset to nil by Stop.
+		if err := server.Serve(sock); err != nil {
+			log.Println(err)
+		}
+	}()
+
+	// Wait for the server to start by opening a blocking connection.
+	conn, err := dial(m.socket, 5*time.Second)
+	if err != nil {
+		// Do not leave a running server and its listener behind.
+		server.Stop()
+		m.server = nil
+		return err
+	}
+	conn.Close()
+
+	return nil
+}
+
+// Stop shuts the gRPC server down, signals ListAndWatch to return by closing
+// the stop channel, and removes the socket file.
+//
+// It is a no-op returning nil when no server is running. Otherwise it returns
+// the error from removing the socket file, if any.
+func (m *RealNodeDevicePlugin) Stop() error {
+	if m.server == nil {
+		return nil
+	}
+
+	m.server.Stop()
+	m.server = nil
+
+	// close panics on a nil channel, and m.stop is nil unless something
+	// created it. Resetting it after closing guarantees it is never closed
+	// twice.
+	if m.stop != nil {
+		close(m.stop)
+		m.stop = nil
+	}
+
+	return m.cleanup()
+}
+
+// Register announces this plugin to the registration service reachable at
+// kubeletEndpoint, advertising resourceName and the base name of the plugin's
+// own socket as its endpoint.
+//
+// It returns the error from connecting (bounded to five seconds) or from the
+// registration call, and nil on success.
+func (m *RealNodeDevicePlugin) Register(kubeletEndpoint, resourceName string) error {
+	conn, err := dial(kubeletEndpoint, 5*time.Second)
+	if err != nil {
+		return err
+	}
+	defer conn.Close()
+
+	request := &pluginapi.RegisterRequest{
+		Version:      pluginapi.Version,
+		Endpoint:     path.Base(m.socket),
+		ResourceName: resourceName,
+	}
+
+	_, err = pluginapi.NewRegistrationClient(conn).Register(context.Background(), request)
+	return err
+}
+
+// ListAndWatch sends the current device list on the stream and then keeps the
+// stream open, re-sending the list whenever a device is reported on the
+// health channel.
+//
+// It returns nil when the plugin is stopped, the stream context's error when
+// the stream ends, and the send error when the list cannot be delivered.
+func (m *RealNodeDevicePlugin) ListAndWatch(e *pluginapi.Empty, s pluginapi.DevicePlugin_ListAndWatchServer) error {
+	if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
+		// A failed send means the stream is unusable; returning ends the
+		// handler instead of leaving it blocked in the loop below.
+		log.Printf("Failed to send devices to Kubelet: %v\n", err)
+		return err
+	}
+
+	for {
+		select {
+		case <-m.stop:
+			return nil
+		case <-s.Context().Done():
+			// The peer went away or the server is shutting down; without
+			// this case the handler would outlive its stream.
+			return s.Context().Err()
+		case d := <-m.health:
+			// FIXME: there is no way to recover from the Unhealthy state.
+			d.Health = pluginapi.Unhealthy
+			if err := s.Send(&pluginapi.ListAndWatchResponse{Devices: m.devs}); err != nil {
+				log.Printf("failed to send unhealthy update: %v", err)
+				return err
+			}
+		}
+	}
+}
+
+// GetPreferredAllocation returns an empty response; the plugin expresses no
+// preference among devices.
+func (m *RealNodeDevicePlugin) GetPreferredAllocation(context.Context, *pluginapi.PreferredAllocationRequest) (*pluginapi.PreferredAllocationResponse, error) {
+	preferred := new(pluginapi.PreferredAllocationResponse)
+	return preferred, nil
+}
+
+// Allocate answers every container request in reqs with one response that
+// exposes the requested device IDs, comma-separated, through the
+// MOCK_NVIDIA_VISIBLE_DEVICES environment variable and mounts the host's
+// /var/lib/runai/bin/nvidia-smi at /bin/nvidia-smi inside the container.
+//
+// The returned error is always nil.
+func (m *RealNodeDevicePlugin) Allocate(ctx context.Context, reqs *pluginapi.AllocateRequest) (*pluginapi.AllocateResponse, error) {
+	result := &pluginapi.AllocateResponse{}
+
+	for _, containerReq := range reqs.ContainerRequests {
+		visibleDevices := strings.Join(containerReq.DevicesIDs, ",")
+		nvidiaSmiMount := &pluginapi.Mount{
+			ContainerPath: "/bin/nvidia-smi",
+			HostPath:      "/var/lib/runai/bin/nvidia-smi",
+		}
+
+		result.ContainerResponses = append(result.ContainerResponses, &pluginapi.ContainerAllocateResponse{
+			Envs:   map[string]string{"MOCK_NVIDIA_VISIBLE_DEVICES": visibleDevices},
+			Mounts: []*pluginapi.Mount{nvidiaSmiMount},
+		})
+	}
+
+	return result, nil
+}
+
+// PreStartContainer does no work and returns an empty response.
+func (m *RealNodeDevicePlugin) PreStartContainer(context.Context, *pluginapi.PreStartContainerRequest) (*pluginapi.PreStartContainerResponse, error) {
+	response := new(pluginapi.PreStartContainerResponse)
+	return response, nil
+}
+
+// cleanup deletes the plugin's socket file. A file that does not exist is not
+// treated as an error; any other removal failure is returned.
+func (m *RealNodeDevicePlugin) cleanup() error {
+	removeErr := os.Remove(m.socket)
+	if removeErr == nil || os.IsNotExist(removeErr) {
+		return nil
+	}
+
+	return removeErr
+}
+
+// Serve starts the plugin's gRPC server and registers the plugin for
+// resourceName at pluginapi.KubeletSocket.
+//
+// A start failure is logged and returned. A registration failure is logged,
+// the plugin is stopped (a stop failure is only logged) and the registration
+// error is returned.
+func (m *RealNodeDevicePlugin) Serve() error {
+	if startErr := m.Start(); startErr != nil {
+		log.Printf("Could not start device plugin: %s", startErr)
+		return startErr
+	}
+	log.Println("Starting to serve on", m.socket)
+
+	registerErr := m.Register(pluginapi.KubeletSocket, resourceName)
+	if registerErr == nil {
+		log.Println("Registered device plugin with Kubelet")
+		return nil
+	}
+
+	log.Printf("Could not register device plugin: %s", registerErr)
+	if stopErr := m.Stop(); stopErr != nil {
+		log.Printf("Could not stop device plugin: %s", stopErr)
+	}
+
+	return registerErr
+}
@@ -0,0 +1,115 @@
+package node
+
+import (
+	"context"
+	"fmt"
+	"os"
+
+	"github.com/run-ai/fake-gpu-operator/internal/common/constants"
+	appsv1 "k8s.io/api/apps/v1"
+	v1 "k8s.io/api/core/v1"
+	"k8s.io/apimachinery/pkg/api/errors"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/utils/ptr"
+)
+
+// applyFakeNodeDeployments creates or updates, for a fake node, every
+// deployment generated from the deployment templates. Nodes that are not fake
+// are ignored.
+//
+// It stops at, and returns, the first generation or apply error.
+func (p *NodeHandler) applyFakeNodeDeployments(node *v1.Node) error {
+	if !isFakeNode(node) {
+		return nil
+	}
+
+	generated, err := p.generateFakeNodeDeployments(node)
+	if err != nil {
+		return fmt.Errorf("failed to get fake node deployments: %w", err)
+	}
+
+	for idx := range generated {
+		if applyErr := p.applyDeployment(generated[idx]); applyErr != nil {
+			return fmt.Errorf("failed to apply deployment: %w", applyErr)
+		}
+	}
+
+	return nil
+}
+
+// deleteFakeNodeDeployments deletes, for a fake node, every deployment that
+// would be generated from the deployment templates. Nodes that are not fake
+// are ignored, and deployments that are already gone are skipped.
+//
+// It stops at, and returns, the first generation or deletion error.
+func (p *NodeHandler) deleteFakeNodeDeployments(node *v1.Node) error {
+	if !isFakeNode(node) {
+		return nil
+	}
+
+	generated, err := p.generateFakeNodeDeployments(node)
+	if err != nil {
+		return fmt.Errorf("failed to get fake node deployments: %w", err)
+	}
+
+	for idx := range generated {
+		target := &generated[idx]
+		deleteErr := p.kubeClient.AppsV1().Deployments(target.Namespace).Delete(context.TODO(), target.Name, metav1.DeleteOptions{})
+		if deleteErr == nil || errors.IsNotFound(deleteErr) {
+			continue
+		}
+		return fmt.Errorf("failed to delete deployment %s: %w", target.Name, deleteErr)
+	}
+
+	return nil
+}
+
+// generateFakeNodeDeployments lists the deployments labelled
+// constants.FakeNodeDeploymentTemplateLabel=true in the namespace named by the
+// constants.EnvFakeGpuOperatorNs environment variable and derives one
+// node-specific deployment from each of them.
+//
+// The result is empty, not nil, when no template matches. A listing failure
+// is returned wrapped.
+func (p *NodeHandler) generateFakeNodeDeployments(node *v1.Node) ([]appsv1.Deployment, error) {
+	namespace := os.Getenv(constants.EnvFakeGpuOperatorNs)
+	selector := fmt.Sprintf("%s=true", constants.FakeNodeDeploymentTemplateLabel)
+
+	templates, err := p.kubeClient.AppsV1().Deployments(namespace).List(context.TODO(), metav1.ListOptions{
+		LabelSelector: selector,
+	})
+	if err != nil {
+		return nil, fmt.Errorf("failed to list deployments: %w", err)
+	}
+
+	generated := make([]appsv1.Deployment, 0, len(templates.Items))
+	for idx := range templates.Items {
+		fromTemplate := generateFakeNodeDeploymentFromTemplate(&templates.Items[idx], node)
+		generated = append(generated, *fromTemplate)
+	}
+
+	return generated, nil
+}
+
+// applyDeployment creates the deployment when none with that name exists in
+// its namespace and updates the existing one otherwise.
+//
+// On create the resource version is cleared; on update the UID and resource
+// version of the existing object are copied in before the request is sent.
+// Get, create and update failures are returned wrapped.
+func (p *NodeHandler) applyDeployment(deployment appsv1.Deployment) error {
+	client := p.kubeClient.AppsV1().Deployments(deployment.Namespace)
+
+	current, getErr := client.Get(context.TODO(), deployment.Name, metav1.GetOptions{})
+	switch {
+	case getErr == nil:
+		deployment.UID = current.UID
+		deployment.ResourceVersion = current.ResourceVersion
+		if _, err := client.Update(context.TODO(), &deployment, metav1.UpdateOptions{}); err != nil {
+			return fmt.Errorf("failed to update deployment %s: %w", deployment.Name, err)
+		}
+	case errors.IsNotFound(getErr):
+		deployment.ResourceVersion = ""
+		if _, err := client.Create(context.TODO(), &deployment, metav1.CreateOptions{}); err != nil {
+			return fmt.Errorf("failed to create deployment %s: %w", deployment.Name, err)
+		}
+	default:
+		return fmt.Errorf("failed to get deployment %s: %w", deployment.Name, getErr)
+	}
+
+	return nil
+}
+
+// generateFakeNodeDeploymentFromTemplate returns a deep copy of template
+// adapted to node: the template label is removed, the node name is appended
+// to the deployment name, replicas is set to 1, and the first container gains
+// the constants.EnvNodeName (node name) and constants.EnvFakeNode ("true")
+// environment variables. The template itself is not modified.
+func generateFakeNodeDeploymentFromTemplate(template *appsv1.Deployment, node *v1.Node) *appsv1.Deployment {
+	result := template.DeepCopy()
+
+	delete(result.Labels, constants.FakeNodeDeploymentTemplateLabel)
+	result.Name = fmt.Sprintf("%s-%s", result.Name, node.Name)
+	result.Spec.Replicas = ptr.To(int32(1))
+
+	// Only the first container receives the node-specific environment.
+	firstContainer := &result.Spec.Template.Spec.Containers[0]
+	firstContainer.Env = append(firstContainer.Env,
+		v1.EnvVar{Name: constants.EnvNodeName, Value: node.Name},
+		v1.EnvVar{Name: constants.EnvFakeNode, Value: "true"},
+	)
+
+	return result
+}
+
+// isFakeNode reports whether node is non-nil and carries the
+// constants.KwokNodeAnnotation annotation with the value "fake".
+func isFakeNode(node *v1.Node) bool {
+	if node == nil {
+		return false
+	}
+
+	return node.Annotations[constants.KwokNodeAnnotation] == "fake"
+}
@@ -4,7 +4,6 @@ import (
 	"fmt"
 	"log"
 
-	"github.com/google/uuid"
 	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
 	v1 "k8s.io/api/core/v1"
 	"k8s.io/apimachinery/pkg/api/errors"
@@ -31,28 +30,14 @@ func NewNodeHandler(kubeClient kubernetes.Interface) *NodeHandler {
 func (p *NodeHandler) HandleAdd(node *v1.Node) error {
 	log.Printf("Handling node addition: %s\n", node.Name)
 
-	nodeTopology, _ := topology.GetNodeTopologyFromCM(p.kubeClient, node.Name)
-	if nodeTopology != nil {
-		return nil
-	}
-
-	baseTopology, err := topology.GetBaseTopologyFromCM(p.kubeClient)
+	err := p.createNodeTopologyCM(node)
 	if err != nil {
-		return fmt.Errorf("failed to get base topology: %w", err)
-	}
-
-	nodeAutofillSettings := baseTopology.Config.NodeAutofill
-
-	nodeTopology = &topology.NodeTopology{
-		GpuMemory:   nodeAutofillSettings.GpuMemory,
-		GpuProduct:  nodeAutofillSettings.GpuProduct,
-		Gpus:        generateGpuDetails(nodeAutofillSettings.GpuCount, node.Name),
-		MigStrategy: nodeAutofillSettings.MigStrategy,
+		return fmt.Errorf("failed to create node topology ConfigMap: %w", err)
 	}
 
-	err = topology.CreateNodeTopologyCM(p.kubeClient, nodeTopology, node.Name)
+	err = p.applyFakeNodeDeployments(node)
 	if err != nil {
-		return fmt.Errorf("failed to create node topology: %w", err)
+		return fmt.Errorf("failed to apply fake node deployments: %w", err)
 	}
 
 	return nil
@@ -66,16 +51,10 @@ func (p *NodeHandler) HandleDelete(node *v1.Node) error {
 		return fmt.Errorf("failed to delete node topology: %w", err)
 	}
 
-	return nil
-}
-
-func generateGpuDetails(gpuCount int, nodeName string) []topology.GpuDetails {
-	gpus := make([]topology.GpuDetails, gpuCount)
-	for idx := range gpus {
-		gpus[idx] = topology.GpuDetails{
-			ID: fmt.Sprintf("GPU-%s", uuid.NewSHA1(uuid.Nil, []byte(fmt.Sprintf("%s-%d", nodeName, idx)))),
-		}
+	err = p.deleteFakeNodeDeployments(node)
+	if err != nil {
+		return fmt.Errorf("failed to delete fake node deployments: %w", err)
 	}
 
-	return gpus
+	return nil
 }
@@ -0,0 +1,48 @@
+package node
+
+import (
+	"fmt"
+
+	"github.com/google/uuid"
+	"github.com/run-ai/fake-gpu-operator/internal/common/topology"
+	v1 "k8s.io/api/core/v1"
+)
+
+// createNodeTopologyCM creates the topology ConfigMap for node from the base
+// topology's node-autofill settings, unless a topology for that node can
+// already be read.
+//
+// Failures to read the base topology or to create the ConfigMap are returned
+// wrapped.
+func (p *NodeHandler) createNodeTopologyCM(node *v1.Node) error {
+	// The lookup error is discarded: only a non-nil result short-circuits.
+	if existing, _ := topology.GetNodeTopologyFromCM(p.kubeClient, node.Name); existing != nil {
+		return nil
+	}
+
+	base, err := topology.GetBaseTopologyFromCM(p.kubeClient)
+	if err != nil {
+		return fmt.Errorf("failed to get base topology: %w", err)
+	}
+
+	autofill := base.Config.NodeAutofill
+	generated := &topology.NodeTopology{
+		GpuMemory:   autofill.GpuMemory,
+		GpuProduct:  autofill.GpuProduct,
+		Gpus:        generateGpuDetails(autofill.GpuCount, node.Name),
+		MigStrategy: autofill.MigStrategy,
+	}
+
+	if err := topology.CreateNodeTopologyCM(p.kubeClient, generated, node.Name); err != nil {
+		return fmt.Errorf("failed to create node topology: %w", err)
+	}
+
+	return nil
+}
+
+// generateGpuDetails returns gpuCount GPU entries whose IDs have the form
+// "GPU-<uuid>", where the UUID is the SHA-1 based UUID of
+// "<nodeName>-<index>" in the nil namespace. The same node name and count
+// therefore always yield the same IDs.
+func generateGpuDetails(gpuCount int, nodeName string) []topology.GpuDetails {
+	details := make([]topology.GpuDetails, gpuCount)
+	for i := 0; i < len(details); i++ {
+		seed := fmt.Sprintf("%s-%d", nodeName, i)
+		gpuUUID := uuid.NewSHA1(uuid.Nil, []byte(seed))
+		details[i] = topology.GpuDetails{
+			ID: fmt.Sprintf("GPU-%s", gpuUUID),
+		}
+	}
+
+	return details
+}
Original file line number	Diff line number	Diff line change
`@@ -2,10 +2,14 @@ package main`
`2`	`2`
`3`	`3`	`import (`
`4`	`4`	`"github.com/run-ai/fake-gpu-operator/internal/common/app"`
	`5`	`+ "github.com/run-ai/fake-gpu-operator/internal/common/config"`
`5`	`6`	`status_updater "github.com/run-ai/fake-gpu-operator/internal/status-updater"`
`6`	`7`	`)`
`7`	`8`
`8`	`9`	`func main() {`
	`10`	`+ requiredEnvVars := []string{"TOPOLOGY_CM_NAME", "TOPOLOGY_CM_NAMESPACE", "FAKE_GPU_OPERATOR_NAMESPACE"}`
	`11`	`+ config.ValidateConfig(requiredEnvVars)`
	`12`	`+`
`9`	`13`	`appRunner := app.NewAppRunner(&status_updater.StatusUpdaterApp{})`
`10`	`14`	`appRunner.Run()`
`11`	`15`	`}`