NexusGPU
diff --git a/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 16 additions & 0 deletions b/‎api/v1/schedulingconfigtemplate_types.go‎
Lines changed: 16 additions & 0 deletions
diff --git a/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 25 additions & 0 deletions b/‎api/v1/zz_generated.deepcopy.go‎
Lines changed: 25 additions & 0 deletions
diff --git a/‎charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 14 additions & 0 deletions b/‎charts/tensor-fusion/crds/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎cmd/main.go‎
Lines changed: 2 additions & 0 deletions b/‎cmd/main.go‎
Lines changed: 2 additions & 0 deletions
diff --git a/‎config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 14 additions & 0 deletions b/‎config/crd/bases/tensor-fusion.ai_schedulingconfigtemplates.yaml‎
Lines changed: 14 additions & 0 deletions
diff --git a/‎internal/constants/constants.go‎
Lines changed: 9 additions & 0 deletions b/‎internal/constants/constants.go‎
Lines changed: 9 additions & 0 deletions
diff --git a/‎internal/scheduler/gpuresources/gpuresources.go‎
Lines changed: 45 additions & 0 deletions b/‎internal/scheduler/gpuresources/gpuresources.go‎
Lines changed: 45 additions & 0 deletions
@@ -39,6 +39,10 @@ type SchedulingConfigTemplateSpec struct {
 	// single GPU device multi-process queuing and fair scheduling with QoS constraint
 	// +optional
 	Hypervisor *HypervisorScheduling `json:"hypervisor,omitempty"`
+
+	// enable Dynamic Resource Allocation (DRA) for GPU resource management
+	// +optional
+	DRA *DRAConfig `json:"dra,omitempty"`
 }
 
 type PlacementConfig struct {
@@ -206,6 +210,18 @@ type MultiProcessQueuing struct {
 	QueueLevelTimeSlices []string `json:"queueLevelTimeSlices,omitempty"`
 }
 
+// DRAConfig configures Dynamic Resource Allocation (DRA) support; it is the type of the optional "dra" field on SchedulingConfigTemplateSpec. NOTE(review): the ResourceClass default "tensorfusion.ai/gpu" is spelled differently from the "tensor-fusion.ai" name used in the CRD file paths — confirm this is intended.
+type DRAConfig struct {
+	// Enable DRA mode for all workloads in this configuration template
+	// +optional
+	Enable *bool `json:"enable,omitempty"`
+
+	// ResourceClass specifies the DRA resource class name to use
+	// +kubebuilder:default="tensorfusion.ai/gpu"
+	// +optional
+	ResourceClass string `json:"resourceClass,omitempty"`
+}
+
 // SchedulingConfigTemplateStatus defines the observed state of SchedulingConfigTemplate.
 type SchedulingConfigTemplateStatus struct {
 	// INSERT ADDITIONAL STATUS FIELD - define observed state of cluster
 
@@ -143,6 +143,20 @@ spec:
                         type: string
                     type: object
                 type: object
+              dra:
+                description: enable Dynamic Resource Allocation (DRA) for GPU resource
+                  management
+                properties:
+                  enable:
+                    description: Enable DRA mode for all workloads in this configuration
+                      template
+                    type: boolean
+                  resourceClass:
+                    default: tensorfusion.ai/gpu
+                    description: ResourceClass specifies the DRA resource class name
+                      to use
+                    type: string
+                type: object
               hypervisor:
                 description: single GPU device multi-process queuing and fair scheduling
                   with QoS constraint
 
@@ -33,6 +33,7 @@ import (
 	"k8s.io/client-go/rest"
 	"k8s.io/klog/v2"
 
+	resourcev1beta2 "k8s.io/api/resource/v1beta2"
 	"k8s.io/apimachinery/pkg/runtime"
 	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
 	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
@@ -99,6 +100,7 @@ var alertEvaluatorReady chan struct{}
 func init() {
 	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
 	utilruntime.Must(tfv1.AddToScheme(scheme))
+	utilruntime.Must(resourcev1beta2.AddToScheme(scheme)) // registers the k8s.io/api/resource/v1beta2 types with the scheme
 	// +kubebuilder:scaffold:scheme
 }
 
 
@@ -143,6 +143,20 @@ spec:
                         type: string
                     type: object
                 type: object
+              dra:
+                description: enable Dynamic Resource Allocation (DRA) for GPU resource
+                  management
+                properties:
+                  enable:
+                    description: Enable DRA mode for all workloads in this configuration
+                      template
+                    type: boolean
+                  resourceClass:
+                    default: tensorfusion.ai/gpu
+                    description: ResourceClass specifies the DRA resource class name
+                      to use
+                    type: string
+                type: object
               hypervisor:
                 description: single GPU device multi-process queuing and fair scheduling
                   with QoS constraint
 
@@ -121,6 +121,15 @@ const (
 	QoSLevelMedium   = "medium"
 	QoSLevelHigh     = "high"
 	QoSLevelCritical = "critical"
+
+	// DRA support
+	// annotation for pod to indicate if DRA is enabled
+	DRAEnabledAnnotation = Domain + "/dra-enabled"
+	DRAResourceClaimName = "tensor-fusion-resource-claim-%s"
+	// resource claim name for request
+	DRAResourceClaimRequestName = "tensor-fusion-resource-claim-request-%s"
+
+	DRAClaimDefineName = "tensor-fusion-gpu-claim"
 )
 
 // for avoid golang lint issues
 
@@ -26,6 +26,7 @@ import (
 const Name = "GPUResourcesFit"
 const CycleStateAllocateRequest = "allocateRequest"
 const CycleStateGPUSchedulingResult = "gpuSchedulingResult"
+
 const SchedulerSimulationKey = "schedulerSimulation"
 
 var _ framework.PreFilterPlugin = &GPUFit{}
@@ -105,6 +106,11 @@ func (s *GPUFit) PreFilter(ctx context.Context, state *framework.CycleState, pod
 		}, framework.NewStatus(framework.Success, "progressive migration for native resources claim")
 	}
 
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return nil, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU prefilter")
+	}
+
 	// Skip non tensor-fusion mode
 	if !utils.IsTensorFusionWorker(pod) {
 		return nil, framework.NewStatus(framework.Skip, "skip for non tensor-fusion mode")
@@ -207,6 +213,11 @@ func (s *GPUFit) PreFilterExtensions() framework.PreFilterExtensions {
 }
 
 func (s *GPUFit) Filter(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeInfo *framework.NodeInfo) *framework.Status {
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU filter")
+	}
+
 	if !utils.IsTensorFusionWorker(pod) {
 		return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode")
 	}
@@ -228,6 +239,11 @@ func (s *GPUFit) Score(
 	pod *v1.Pod,
 	nodeInfo *framework.NodeInfo,
 ) (int64, *framework.Status) {
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return 0, framework.NewStatus(framework.Skip, "DRA mode enabled, skipping custom GPU scoring")
+	}
+
 	// Skip non tensor-fusion mode scheduling
 	if !utils.IsTensorFusionWorker(pod) {
 		return 0, framework.NewStatus(framework.Success, "")
@@ -266,6 +282,11 @@ func (s *GPUFit) ScoreExtensions() framework.ScoreExtensions {
 }
 
 func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) *framework.Status {
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return framework.NewStatus(framework.Success, "DRA mode enabled, skipping custom GPU reservation")
+	}
+
 	if !utils.IsTensorFusionWorker(pod) {
 		return framework.NewStatus(framework.Success, "skip for non tensor-fusion mode")
 	}
@@ -312,6 +333,11 @@ func (s *GPUFit) Reserve(ctx context.Context, state *framework.CycleState, pod *
 }
 
 func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return // DRA handles unreservation
+	}
+
 	if !utils.IsTensorFusionWorker(pod) {
 		return
 	}
@@ -331,6 +357,11 @@ func (s *GPUFit) Unreserve(ctx context.Context, state *framework.CycleState, pod
 }
 
 func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod *v1.Pod, nodeName string) {
+	// Check if DRA mode is enabled for this pod
+	if isDRAEnabled(pod) && hasDRAClaim(pod) {
+		return // DRA handles post-bind actions
+	}
+
 	if !utils.IsTensorFusionWorker(pod) {
 		return
 	}
@@ -359,3 +390,17 @@ func (s *GPUFit) PostBind(ctx context.Context, state *framework.CycleState, pod
 			"Attach GPU device ID info", "Attach TensorFusion GPU device IDs to Pod: "+gpuIDs)
 	}
 }
+
+// isDRAEnabled reports whether the pod opts in to DRA, i.e. whether its
+// constants.DRAEnabledAnnotation annotation is present and equal to
+// constants.TrueStringValue.
+func isDRAEnabled(pod *v1.Pod) bool {
+	// Indexing a nil map yields the zero value with found == false, so a pod
+	// without any annotations falls through to the "not enabled" result.
+	flag, found := pod.Annotations[constants.DRAEnabledAnnotation]
+	if !found {
+		return false
+	}
+	return flag == constants.TrueStringValue
+}
+
+// hasDRAClaim reports whether the pod spec lists at least one entry in
+// Spec.ResourceClaims. Any entry counts; the claims themselves are not
+// inspected.
+func hasDRAClaim(pod *v1.Pod) bool {
+	claims := pod.Spec.ResourceClaims
+	return len(claims) != 0
+}