Skip to content

Commit 4959c61

Browse files
resource template support
1 parent 5f25794 commit 4959c61

File tree

7 files changed

+867
-345
lines changed

7 files changed

+867
-345
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -216,10 +216,9 @@ type DRAConfig struct {
216216
// +optional
217217
Enable *bool `json:"enable,omitempty"`
218218

219-
// ResourceClass specifies the DRA resource class name to use
220-
// +kubebuilder:default="tensorfusion.ai/gpu"
219+
// ResourceClaimTemplateName specifies the ResourceClaim template name to use
221220
// +optional
222-
ResourceClass string `json:"resourceClass,omitempty"`
221+
ResourceClaimTemplateName string `json:"resourceClaimTemplateName,omitempty"`
223222
}
224223

225224
// SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate.

cmd/main.go

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -56,6 +56,7 @@ import (
5656
"github.com/NexusGPU/tensor-fusion/internal/config"
5757
"github.com/NexusGPU/tensor-fusion/internal/constants"
5858
"github.com/NexusGPU/tensor-fusion/internal/controller"
59+
"github.com/NexusGPU/tensor-fusion/internal/controller/dra"
5960
"github.com/NexusGPU/tensor-fusion/internal/gpuallocator"
6061
"github.com/NexusGPU/tensor-fusion/internal/metrics"
6162
"github.com/NexusGPU/tensor-fusion/internal/portallocator"
@@ -397,6 +398,15 @@ func startCustomResourceController(
397398
setupLog.Error(err, "unable to create controller", "controller", "Pod")
398399
os.Exit(1)
399400
}
401+
402+
// Set up the ResourceClaim controller for DRA Phase 2
403+
if err = (&dra.ResourceClaimReconciler{
404+
Client: mgr.GetClient(),
405+
Scheme: mgr.GetScheme(),
406+
}).SetupWithManager(mgr); err != nil {
407+
setupLog.Error(err, "unable to create controller", "controller", "ResourceClaim")
408+
os.Exit(1)
409+
}
400410
if err = (&controller.NodeReconciler{
401411
Client: mgr.GetClient(),
402412
Scheme: mgr.GetScheme(),

internal/constants/constants.go

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -124,12 +124,20 @@ const (
124124

125125
// DRA support
126126
// annotation for pod to indicate if DRA is enabled
127-
DRAEnabledAnnotation = Domain + "/dra-enabled"
128-
DRAResourceClaimName = "tensor-fusion-resource-claim-%s"
127+
DRAEnabledAnnotation = Domain + "/dra-enabled"
128+
DRACelExpressionAnnotation = Domain + "/dra-cel-expression"
129+
130+
DRADriverName = Domain + ".dra-driver"
131+
DRAResourceClaimName = "tensor-fusion-resource-claim-%s-%s"
129132
// resource claim name for request
130133
DRAResourceClaimRequestName = "tensor-fusion-resource-claim-request-%s"
131134

132135
DRAClaimDefineName = "tensor-fusion-gpu-claim"
136+
137+
TensorFusionResourceClaimTemplateLabel = Domain + "/resource-claim-template"
138+
139+
// ResourceClaimTemplate related constants
140+
DRAResourceClaimTemplateName = "tensor-fusion-gpu-template"
133141
)
134142

135143
// to avoid golang lint issues
Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
/*
2+
Copyright 2024.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package dra
18+
19+
import (
20+
"context"
21+
"fmt"
22+
23+
resourcev1beta2 "k8s.io/api/resource/v1beta2"
24+
corev1 "k8s.io/api/core/v1"
25+
"k8s.io/apimachinery/pkg/api/errors"
26+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
27+
"k8s.io/apimachinery/pkg/runtime"
28+
"k8s.io/apimachinery/pkg/types"
29+
ctrl "sigs.k8s.io/controller-runtime"
30+
"sigs.k8s.io/controller-runtime/pkg/client"
31+
"sigs.k8s.io/controller-runtime/pkg/log"
32+
33+
"github.com/NexusGPU/tensor-fusion/internal/constants"
34+
)
35+
36+
// ResourceClaimReconciler reconciles resource.k8s.io/v1beta2 ResourceClaim
// objects. Its Reconcile method only acts on claims that carry the TensorFusion
// resource-claim-template label: it copies the CEL expression found in the
// owning Pod's annotation into the first selector of the claim's first device
// request.
37+
type ResourceClaimReconciler struct {
38+
// Client is the embedded controller-runtime client; the methods in this file
// use it to Get Pods and to Get/Update ResourceClaims.
client.Client
39+
// Scheme is the runtime scheme handed over by the manager at construction
// time. It is not referenced by the methods visible in this file.
Scheme *runtime.Scheme
40+
}
41+
42+
//+kubebuilder:rbac:groups=resource.k8s.io,resources=resourceclaims,verbs=get;list;watch;create;update;patch;delete
43+
//+kubebuilder:rbac:groups=core,resources=pods,verbs=get;list;watch
44+
45+
// Reconcile is part of the main kubernetes reconciliation loop which aims to
46+
// move the current state of the cluster closer to the desired state.
47+
func (r *ResourceClaimReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
48+
log := log.FromContext(ctx)
49+
50+
// Fetch the ResourceClaim instance
51+
resourceClaim := &resourcev1beta2.ResourceClaim{}
52+
if err := r.Get(ctx, req.NamespacedName, resourceClaim); err != nil {
53+
if errors.IsNotFound(err) {
54+
// Request object not found, could have been deleted after reconcile request.
55+
// Owned objects are automatically garbage collected. For additional cleanup logic use finalizers.
56+
// Return and don't requeue
57+
log.Info("ResourceClaim resource not found. Ignoring since object must be deleted")
58+
return ctrl.Result{}, nil
59+
}
60+
// Error reading the object - requeue the request.
61+
log.Error(err, "Failed to get ResourceClaim")
62+
return ctrl.Result{}, err
63+
}
64+
65+
// Check if this ResourceClaim is created from our ResourceClaimTemplate
66+
if resourceClaim.Labels == nil {
67+
// No labels, not our ResourceClaim
68+
return ctrl.Result{}, nil
69+
}
70+
71+
labelValue, exists := resourceClaim.Labels[constants.TensorFusionResourceClaimTemplateLabel]
72+
if !exists || labelValue != constants.TrueStringValue {
73+
// Not our ResourceClaim, ignore
74+
return ctrl.Result{}, nil
75+
}
76+
77+
log.Info("Processing TensorFusion ResourceClaim", "name", resourceClaim.Name, "namespace", resourceClaim.Namespace)
78+
79+
// Find the owner Pod to get the CEL expression annotation
80+
ownerPod, err := r.findOwnerPod(ctx, resourceClaim)
81+
if err != nil {
82+
log.Error(err, "Failed to find owner Pod")
83+
return ctrl.Result{}, err
84+
}
85+
86+
if ownerPod == nil {
87+
log.Info("Owner Pod not found, ResourceClaim may not have OwnerReference yet")
88+
return ctrl.Result{RequeueAfter: constants.PendingRequeueDuration}, nil
89+
}
90+
91+
// Get CEL expression from Pod annotation
92+
celExpression := ownerPod.Annotations[constants.DRACelExpressionAnnotation]
93+
if celExpression == "" {
94+
log.Info("No CEL expression found in Pod annotation", "pod", ownerPod.Name)
95+
return ctrl.Result{}, nil
96+
}
97+
98+
// Update ResourceClaim with CEL expression
99+
if err := r.updateResourceClaimCEL(ctx, resourceClaim, celExpression); err != nil {
100+
log.Error(err, "Failed to update ResourceClaim CEL expression")
101+
return ctrl.Result{}, err
102+
}
103+
104+
log.Info("Successfully updated ResourceClaim with CEL expression", "cel", celExpression)
105+
return ctrl.Result{}, nil
106+
}
107+
108+
// findOwnerPod finds the Pod that owns this ResourceClaim
109+
func (r *ResourceClaimReconciler) findOwnerPod(ctx context.Context, resourceClaim *resourcev1beta2.ResourceClaim) (*corev1.Pod, error) {
110+
// Find the Pod OwnerReference (there should be exactly one)
111+
var podOwnerRef *metav1.OwnerReference
112+
for i, ownerRef := range resourceClaim.OwnerReferences {
113+
if ownerRef.Kind == "Pod" && ownerRef.APIVersion == "v1" {
114+
podOwnerRef = &resourceClaim.OwnerReferences[i]
115+
break
116+
}
117+
}
118+
119+
if podOwnerRef == nil {
120+
return nil, nil // No Pod owner found
121+
}
122+
123+
// Get the Pod by name and namespace (UID is automatically verified by Kubernetes)
124+
pod := &corev1.Pod{}
125+
err := r.Get(ctx, types.NamespacedName{
126+
Name: podOwnerRef.Name,
127+
Namespace: resourceClaim.Namespace,
128+
}, pod)
129+
if err != nil {
130+
if errors.IsNotFound(err) {
131+
return nil, nil // Pod was deleted
132+
}
133+
return nil, fmt.Errorf("failed to get owner Pod %s/%s: %w", resourceClaim.Namespace, podOwnerRef.Name, err)
134+
}
135+
136+
// Verify the UID matches (additional safety check)
137+
if pod.UID != podOwnerRef.UID {
138+
return nil, fmt.Errorf("Pod UID mismatch: expected %s, got %s", podOwnerRef.UID, pod.UID)
139+
}
140+
141+
return pod, nil
142+
}
143+
144+
// updateResourceClaimCEL updates the ResourceClaim's CEL selector expression
145+
func (r *ResourceClaimReconciler) updateResourceClaimCEL(ctx context.Context, resourceClaim *resourcev1beta2.ResourceClaim, celExpression string) error {
146+
// Check if we need to update
147+
if len(resourceClaim.Spec.Devices.Requests) == 0 {
148+
return fmt.Errorf("no device requests found in ResourceClaim")
149+
}
150+
151+
deviceReq := &resourceClaim.Spec.Devices.Requests[0]
152+
if deviceReq.Exactly == nil {
153+
return fmt.Errorf("no ExactDeviceRequest found")
154+
}
155+
156+
// Check if CEL expression is already set correctly
157+
if len(deviceReq.Exactly.Selectors) > 0 &&
158+
deviceReq.Exactly.Selectors[0].CEL != nil &&
159+
deviceReq.Exactly.Selectors[0].CEL.Expression == celExpression {
160+
// Already updated
161+
return nil
162+
}
163+
164+
// Update the CEL expression
165+
if len(deviceReq.Exactly.Selectors) == 0 {
166+
deviceReq.Exactly.Selectors = []resourcev1beta2.DeviceSelector{{}}
167+
}
168+
169+
if deviceReq.Exactly.Selectors[0].CEL == nil {
170+
deviceReq.Exactly.Selectors[0].CEL = &resourcev1beta2.CELDeviceSelector{}
171+
}
172+
173+
deviceReq.Exactly.Selectors[0].CEL.Expression = celExpression
174+
175+
// Update the ResourceClaim
176+
return r.Update(ctx, resourceClaim)
177+
}
178+
179+
// SetupWithManager sets up the controller with the Manager.
180+
func (r *ResourceClaimReconciler) SetupWithManager(mgr ctrl.Manager) error {
181+
return ctrl.NewControllerManagedBy(mgr).
182+
For(&resourcev1beta2.ResourceClaim{}).
183+
Complete(r)
184+
}

0 commit comments

Comments
 (0)