Skip to content

Commit 5f25794

Browse files
webhook & gpu resource fit dra support
1 parent 8503585 commit 5f25794

File tree

14 files changed

+1132
-16
lines changed

14 files changed

+1132
-16
lines changed

api/v1/schedulingconfigtemplate_types.go

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,10 @@ type SchedulingConfigTemplateSpec struct {
3939
// single GPU device multi-process queuing and fair scheduling with QoS constraint
4040
// +optional
4141
Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"`
42+
43+
// enable Dynamic Resource Allocation (DRA) for GPU resource management
44+
// +optional
45+
DRA *DRAConfig `json:"dra,omitempty"`
4246
}
4347

4448
type PlacementConfig struct {
@@ -206,6 +210,18 @@ type MultiProcessQueuing struct {
206210
QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"`
207211
}
208212

213+
// DRAConfig configures Dynamic Resource Allocation (DRA) support. It is
// referenced from SchedulingConfigTemplateSpec through the optional `dra` field.
//
// Enable is a *bool, so an omitted value (nil) stays distinguishable from an
// explicit false; how nil is interpreted is up to the consumer (not visible here).
//
// NOTE(review): the ResourceClass default "tensorfusion.ai/gpu" contains a "/".
// If the value is ever used verbatim as a Kubernetes object name (for example a
// DeviceClass name) it will be rejected by name validation - confirm the intended use.
214+
type DRAConfig struct {
215+
// Enable DRA mode for all workloads in this configuration template
216+
// +optional
217+
Enable *bool `json:"enable,omitempty"`
218+
219+
// ResourceClass specifies the DRA resource class name to use
220+
// +kubebuilder:default="tensorfusion.ai/gpu"
221+
// +optional
222+
ResourceClass string `json:"resourceClass,omitempty"`
223+
}
224+
209225
// SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate.
210226
type SchedulingConfigTemplateStatus struct {
211227
// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster

api/v1/zz_generated.deepcopy.go

Lines changed: 25 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,20 @@ spec:
143143
type: string
144144
type: object
145145
type: object
146+
dra:
147+
description: enable Dynamic Resource Allocation (DRA) for GPU resource
148+
management
149+
properties:
150+
enable:
151+
description: Enable DRA mode for all workloads in this configuration
152+
template
153+
type: boolean
154+
resourceClass:
155+
default: tensorfusion.ai/gpu
156+
description: ResourceClass specifies the DRA resource class name
157+
to use
158+
type: string
159+
type: object
146160
hypervisor:
147161
description: single GPU device multi-process queuing and fair scheduling
148162
with QoS constraint

cmd/main.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ import (
3333
"k8s.io/client-go/rest"
3434
"k8s.io/klog/v2"
3535

36+
resourcev1beta2 "k8s.io/api/resource/v1beta2"
3637
"k8s.io/apimachinery/pkg/runtime"
3738
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
3839
clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -99,6 +100,7 @@ var alertEvaluatorReady chan struct{}
99100
// init registers every API group the manager works with into the package-level
// scheme: the client-go built-in types, the TensorFusion v1 types (tfv1), and the
// k8s.io/api/resource/v1beta2 types (resourcev1beta2) needed for DRA objects.
// Each registration is wrapped in utilruntime.Must, so a registration error
// panics at program start instead of being silently ignored.
func init() {
100101
utilruntime.Must(clientgoscheme.AddToScheme(scheme))
101102
utilruntime.Must(tfv1.AddToScheme(scheme))
103+
// DRA API types, registered so they can be handled through the same scheme.
utilruntime.Must(resourcev1beta2.AddToScheme(scheme))
102104
// +kubebuilder:scaffold:scheme
103105
}
104106

config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -143,6 +143,20 @@ spec:
143143
type: string
144144
type: object
145145
type: object
146+
dra:
147+
description: enable Dynamic Resource Allocation (DRA) for GPU resource
148+
management
149+
properties:
150+
enable:
151+
description: Enable DRA mode for all workloads in this configuration
152+
template
153+
type: boolean
154+
resourceClass:
155+
default: tensorfusion.ai/gpu
156+
description: ResourceClass specifies the DRA resource class name
157+
to use
158+
type: string
159+
type: object
146160
hypervisor:
147161
description: single GPU device multi-process queuing and fair scheduling
148162
with QoS constraint

internal/constants/constants.go

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -121,6 +121,15 @@ const (
121121
QoSLevelMedium = "medium"
122122
QoSLevelHigh = "high"
123123
QoSLevelCritical = "critical"
124+
125+
// DRA support
126+
// annotation for pod to indicate if DRA is enabled
127+
DRAEnabledAnnotation = Domain + "/dra-enabled"
128+
DRAResourceClaimName = "tensor-fusion-resource-claim-%s"
129+
// resource claim name for request
130+
DRAResourceClaimRequestName = "tensor-fusion-resource-claim-request-%s"
131+
132+
DRAClaimDefineName = "tensor-fusion-gpu-claim"
124133
)
125134

126135
// to avoid golang lint issues

internal/scheduler/gpuresources/gpuresources.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626
const Name = "GPUResourcesFit"
2727
// CycleStateAllocateRequest is a string key; presumably the framework.CycleState
// entry under which the pod's allocation request is stored - TODO confirm against
// the Read/Write calls in this plugin.
const CycleStateAllocateRequest = "allocateRequest"
2828
// CycleStateGPUSchedulingResult is a string key; presumably the CycleState entry
// holding the GPU scheduling result shared between plugin phases - TODO confirm.
const CycleStateGPUSchedulingResult = "gpuSchedulingResult"
29+
2930
// SchedulerSimulationKey is a string key.
// NOTE(review): its usage is not visible in this hunk; presumably it marks a
// simulated scheduling run - confirm against callers.
const SchedulerSimulationKey = "schedulerSimulation"
3031

3132
var _ framework.PreFilterPlugin = &GPUFit{}
@@ -105,6 +106,11 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod
105106
}, framework.NewStatus(framework.Success, "progressive migration for native resources claim")
106107
}
107108

109+
// Check if DRA mode is enabled for this pod
110+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
111+
return nil, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU prefilter")
112+
}
113+
108114
// Skip non tensor-fusion mode
109115
if !utils.IsTensorFusionWorker(pod) {
110116
return nil, framework.NewStatus(framework.Skip, "skip for non tensor-fusion mode")
@@ -207,6 +213,11 @@ func (s *GPUFit) PreFilterExtensions() framework.PreFilterExtensions {
207213
}
208214

209215
func (s *GPUFit) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
216+
// Check if DRA mode is enabled for this pod
217+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
218+
return framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU filter")
219+
}
220+
210221
if !utils.IsTensorFusionWorker(pod) {
211222
return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode")
212223
}
@@ -228,6 +239,11 @@ func (s *GPUFit) Score(
228239
pod *v1.Pod,
229240
nodeInfo *framework.NodeInfo,
230241
) (int64, *framework.Status) {
242+
// Check if DRA mode is enabled for this pod
243+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
244+
return 0, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU scoring")
245+
}
246+
231247
// Skip non tensor-fusion mode scheduling
232248
if !utils.IsTensorFusionWorker(pod) {
233249
return 0, framework.NewStatus(framework.Success, "")
@@ -266,6 +282,11 @@ func (s *GPUFit) ScoreExtensions() framework.ScoreExtensions {
266282
}
267283

268284
func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
285+
// Check if DRA mode is enabled for this pod
286+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
287+
return framework.NewStatus(framework.Success, "DRA mode enabled, skipping custom GPU reservation")
288+
}
289+
269290
if !utils.IsTensorFusionWorker(pod) {
270291
return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode")
271292
}
@@ -312,6 +333,11 @@ func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *
312333
}
313334

314335
func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {
336+
// Check if DRA mode is enabled for this pod
337+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
338+
return // DRA handles unreservation
339+
}
340+
315341
if !utils.IsTensorFusionWorker(pod) {
316342
return
317343
}
@@ -331,6 +357,11 @@ func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod
331357
}
332358

333359
func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {
360+
// Check if DRA mode is enabled for this pod
361+
if isDRAEnabled(pod) && hasDRAClaim(pod) {
362+
return // DRA handles post-bind actions
363+
}
364+
334365
if !utils.IsTensorFusionWorker(pod) {
335366
return
336367
}
@@ -359,3 +390,17 @@ func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod
359390
"Attach GPU device ID info", "Attach TensorFusion GPU device IDs to Pod: "+gpuIDs)
360391
}
361392
}
393+
394+
// isDRAEnabled reports whether the pod opts in to DRA mode, i.e. whether it
// carries the constants.DRAEnabledAnnotation annotation with a value equal to
// constants.TrueStringValue. A pod with no annotations, without that key, or
// with any other value yields false. Resource claims are not inspected here;
// the callers in this file combine this check with hasDRAClaim.
395+
func isDRAEnabled(pod *v1.Pod) bool {
396+
// Defensive guard only: indexing a nil map is safe in Go and would give ok == false as well.
if pod.Annotations == nil {
397+
return false
398+
}
399+
val, ok := pod.Annotations[constants.DRAEnabledAnnotation]
400+
// The key must be present AND its value must match exactly.
return ok && val == constants.TrueStringValue
401+
}
402+
403+
// hasDRAClaim reports whether the pod spec lists at least one entry in
// pod.Spec.ResourceClaims. Only the length is checked; the entries themselves
// are not examined.
// NOTE(review): any resource claim counts, not only claims created for
// TensorFusion - confirm that a pod with the DRA annotation plus an unrelated
// claim should also bypass the GPUFit plugin phases.
404+
func hasDRAClaim(pod *v1.Pod) bool {
405+
return len(pod.Spec.ResourceClaims) > 0
406+
}

0 commit comments

Comments
 (0)