1
+ /*
2
+ Copyright 2024.
3
+
4
+ Licensed under the Apache License, Version 2.0 (the "License");
5
+ you may not use this file except in compliance with the License.
6
+ You may obtain a copy of the License at
7
+
8
+ http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ Unless required by applicable law or agreed to in writing, software
11
+ distributed under the License is distributed on an "AS IS" BASIS,
12
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ See the License for the specific language governing permissions and
14
+ limitations under the License.
15
+ */
16
+
17
+ package dra
18
+
19
+ import (
20
+ "context"
21
+ "fmt"
22
+
23
+ resourcev1beta2 "k8s.io/api/resource/v1beta2"
24
+ corev1 "k8s.io/api/core/v1"
25
+ "k8s.io/apimachinery/pkg/api/errors"
26
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27
+ "k8s.io/apimachinery/pkg/runtime"
28
+ "k8s.io/apimachinery/pkg/types"
29
+ ctrl "sigs.k8s.io/controller-runtime"
30
+ "sigs.k8s.io/controller-runtime/pkg/client"
31
+ "sigs.k8s.io/controller-runtime/pkg/log"
32
+
33
+ "github.com/NexusGPU/tensor-fusion/internal/constants"
34
+ )
35
+
36
+ // ResourceClaimReconciler reconciles ResourceClaim objects
37
+ type ResourceClaimReconciler struct {
38
+ client.Client
39
+ Scheme * runtime.Scheme
40
+ }
41
+
42
+ //+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch;create;update;patch;delete
43
+ //+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
44
+
45
+ // Reconcile is part of the main kubernetes reconciliation loop which aims to
46
+ // move the current state of the cluster closer to the desired state.
47
+ func (r * ResourceClaimReconciler ) Reconcile (ctx context.Context , req ctrl.Request ) (ctrl.Result , error ) {
48
+ log := log .FromContext (ctx )
49
+
50
+ // Fetch the ResourceClaim instance
51
+ resourceClaim := & resourcev1beta2.ResourceClaim {}
52
+ if err := r .Get (ctx , req .NamespacedName , resourceClaim ); err != nil {
53
+ if errors .IsNotFound (err ) {
54
+ // Request object not found, could have been deleted after reconcile request.
55
+ // Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
56
+ // Return and don't requeue
57
+ log .Info ("ResourceClaim resource not found. Ignoring since object must be deleted" )
58
+ return ctrl.Result {}, nil
59
+ }
60
+ // Error reading the object - requeue the request.
61
+ log .Error (err , "Failed to get ResourceClaim" )
62
+ return ctrl.Result {}, err
63
+ }
64
+
65
+ // Check if this ResourceClaim is created from our ResourceClaimTemplate
66
+ if resourceClaim .Labels == nil {
67
+ // No labels, not our ResourceClaim
68
+ return ctrl.Result {}, nil
69
+ }
70
+
71
+ labelValue , exists := resourceClaim .Labels [constants .TensorFusionResourceClaimTemplateLabel ]
72
+ if ! exists || labelValue != constants .TrueStringValue {
73
+ // Not our ResourceClaim, ignore
74
+ return ctrl.Result {}, nil
75
+ }
76
+
77
+ log .Info ("Processing TensorFusion ResourceClaim" , "name" , resourceClaim .Name , "namespace" , resourceClaim .Namespace )
78
+
79
+ // Find the owner Pod to get the CEL expression annotation
80
+ ownerPod , err := r .findOwnerPod (ctx , resourceClaim )
81
+ if err != nil {
82
+ log .Error (err , "Failed to find owner Pod" )
83
+ return ctrl.Result {}, err
84
+ }
85
+
86
+ if ownerPod == nil {
87
+ log .Info ("Owner Pod not found, ResourceClaim may not have OwnerReference yet" )
88
+ return ctrl.Result {RequeueAfter : constants .PendingRequeueDuration }, nil
89
+ }
90
+
91
+ // Get CEL expression from Pod annotation
92
+ celExpression := ownerPod .Annotations [constants .DRACelExpressionAnnotation ]
93
+ if celExpression == "" {
94
+ log .Info ("No CEL expression found in Pod annotation" , "pod" , ownerPod .Name )
95
+ return ctrl.Result {}, nil
96
+ }
97
+
98
+ // Update ResourceClaim with CEL expression
99
+ if err := r .updateResourceClaimCEL (ctx , resourceClaim , celExpression ); err != nil {
100
+ log .Error (err , "Failed to update ResourceClaim CEL expression" )
101
+ return ctrl.Result {}, err
102
+ }
103
+
104
+ log .Info ("Successfully updated ResourceClaim with CEL expression" , "cel" , celExpression )
105
+ return ctrl.Result {}, nil
106
+ }
107
+
108
+ // findOwnerPod finds the Pod that owns this ResourceClaim
109
+ func (r * ResourceClaimReconciler ) findOwnerPod (ctx context.Context , resourceClaim * resourcev1beta2.ResourceClaim ) (* corev1.Pod , error ) {
110
+ // Find the Pod OwnerReference (there should be exactly one)
111
+ var podOwnerRef * metav1.OwnerReference
112
+ for i , ownerRef := range resourceClaim .OwnerReferences {
113
+ if ownerRef .Kind == "Pod" && ownerRef .APIVersion == "v1" {
114
+ podOwnerRef = & resourceClaim .OwnerReferences [i ]
115
+ break
116
+ }
117
+ }
118
+
119
+ if podOwnerRef == nil {
120
+ return nil , nil // No Pod owner found
121
+ }
122
+
123
+ // Get the Pod by name and namespace (UID is automatically verified by Kubernetes)
124
+ pod := & corev1.Pod {}
125
+ err := r .Get (ctx , types.NamespacedName {
126
+ Name : podOwnerRef .Name ,
127
+ Namespace : resourceClaim .Namespace ,
128
+ }, pod )
129
+ if err != nil {
130
+ if errors .IsNotFound (err ) {
131
+ return nil , nil // Pod was deleted
132
+ }
133
+ return nil , fmt .Errorf ("failed to get owner Pod %s/%s: %w" , resourceClaim .Namespace , podOwnerRef .Name , err )
134
+ }
135
+
136
+ // Verify the UID matches (additional safety check)
137
+ if pod .UID != podOwnerRef .UID {
138
+ return nil , fmt .Errorf ("Pod UID mismatch: expected %s, got %s" , podOwnerRef .UID , pod .UID )
139
+ }
140
+
141
+ return pod , nil
142
+ }
143
+
144
+ // updateResourceClaimCEL updates the ResourceClaim's CEL selector expression
145
+ func (r * ResourceClaimReconciler ) updateResourceClaimCEL (ctx context.Context , resourceClaim * resourcev1beta2.ResourceClaim , celExpression string ) error {
146
+ // Check if we need to update
147
+ if len (resourceClaim .Spec .Devices .Requests ) == 0 {
148
+ return fmt .Errorf ("no device requests found in ResourceClaim" )
149
+ }
150
+
151
+ deviceReq := & resourceClaim .Spec .Devices .Requests [0 ]
152
+ if deviceReq .Exactly == nil {
153
+ return fmt .Errorf ("no ExactDeviceRequest found" )
154
+ }
155
+
156
+ // Check if CEL expression is already set correctly
157
+ if len (deviceReq .Exactly .Selectors ) > 0 &&
158
+ deviceReq .Exactly .Selectors [0 ].CEL != nil &&
159
+ deviceReq .Exactly .Selectors [0 ].CEL .Expression == celExpression {
160
+ // Already updated
161
+ return nil
162
+ }
163
+
164
+ // Update the CEL expression
165
+ if len (deviceReq .Exactly .Selectors ) == 0 {
166
+ deviceReq .Exactly .Selectors = []resourcev1beta2.DeviceSelector {{}}
167
+ }
168
+
169
+ if deviceReq .Exactly .Selectors [0 ].CEL == nil {
170
+ deviceReq .Exactly .Selectors [0 ].CEL = & resourcev1beta2.CELDeviceSelector {}
171
+ }
172
+
173
+ deviceReq .Exactly .Selectors [0 ].CEL .Expression = celExpression
174
+
175
+ // Update the ResourceClaim
176
+ return r .Update (ctx , resourceClaim )
177
+ }
178
+
179
+ // SetupWithManager sets up the controller with the Manager.
180
+ func (r * ResourceClaimReconciler ) SetupWithManager (mgr ctrl.Manager ) error {
181
+ return ctrl .NewControllerManagedBy (mgr ).
182
+ For (& resourcev1beta2.ResourceClaim {}).
183
+ Complete (r )
184
+ }
0 commit comments