Skip to content
This repository was archived by the owner on Sep 19, 2022. It is now read-only.
Open
2 changes: 2 additions & 0 deletions manifests/crd.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,8 @@ spec:
properties:
spec:
properties:
priorityClassName:
type: string
pytorchReplicaSpecs:
properties:
Master:
Expand Down
2 changes: 2 additions & 0 deletions manifests/podgroup.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ spec:
minMember:
format: int32
type: integer
priorityClassName:
type: string
type: object
status:
properties:
Expand Down
4 changes: 4 additions & 0 deletions pkg/apis/pytorch/v1/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,10 @@ type PyTorchJobSpec struct {
// "Worker": PyTorchReplicaSpec,
// }
PyTorchReplicaSpecs map[PyTorchReplicaType]*common.ReplicaSpec `json:"pytorchReplicaSpecs"`

//add PriorityClassName
//PriorityClassName is a type of k8s resource.(kubectl get priorityclass)
PriorityClassName *string `json:"priorityClassName,omitempty"`
}

// PyTorchReplicaType is the type for PyTorchReplica. Can be one of "Master" or "Worker".
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/pytorch/v1/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 4 additions & 0 deletions pkg/apis/pytorch/v1beta2/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ type PyTorchJobSpec struct {
// "Worker": PyTorchReplicaSpec,
// }
PyTorchReplicaSpecs map[PyTorchReplicaType]*common.ReplicaSpec `json:"pytorchReplicaSpecs"`

//add PriorityClassName
//PriorityClassName is a type of k8s resource.(kubectl get priorityclass)
PriorityClassName *string `json:"priorityClassName,omitempty"`
}

// PyTorchReplicaType is the type for PyTorchReplica.
Expand Down
5 changes: 5 additions & 0 deletions pkg/apis/pytorch/v1beta2/zz_generated.deepcopy.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 4 additions & 2 deletions pkg/controller.v1/pytorch/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (

kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand Down Expand Up @@ -437,7 +437,9 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *pyv1.PyTorchJob) error {

if pc.Config.EnableGangScheduling {
minAvailableReplicas := getTotalReplicas(job)
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
priorityClassName := getPriorityClassName(job)
//_, err := pc.SyncPodGroup(job, minAvailableReplicas)
_, err := pc.SyncPodGroup(job, minAvailableReplicas, priorityClassName)
if err != nil {
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/controller.v1/pytorch/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -216,3 +216,8 @@ func getTotalFailedReplicas(job *pyv1.PyTorchJob) int32 {
}
return totalFailedReplicas
}

func getPriorityClassName(job *pyv1.PyTorchJob) string {
priorityClassName := *(job.Spec.PriorityClassName)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There may be an runtime error: invalid memory address or nil pointer dereference

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I need to remove the bracket

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

right? @gaocegege

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No I think you need to check if it is nil.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I got it.

return priorityClassName
}
1 change: 1 addition & 0 deletions pkg/controller.v1/pytorch/status.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,7 @@ func (pc *PyTorchController) updateStatusSingle(job *pyv1.PyTorchJob, rtype pyv1

// Expect to have `replicas - succeeded` pods alive.
commonType := common.ReplicaType(rtype)
//expected is a flag of success.if expected==0,PyTorchJob is successfully completed.
expected := replicas - int(job.Status.ReplicaStatuses[commonType].Succeeded)
running := int(job.Status.ReplicaStatuses[commonType].Active)
failed := int(job.Status.ReplicaStatuses[commonType].Failed)
Expand Down
5 changes: 3 additions & 2 deletions pkg/controller.v1beta2/pytorch/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ import (

kubebatchclient "github.com/kubernetes-sigs/kube-batch/pkg/client/clientset/versioned"
log "github.com/sirupsen/logrus"
"k8s.io/api/core/v1"
v1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime/schema"
utilruntime "k8s.io/apimachinery/pkg/util/runtime"
Expand Down Expand Up @@ -428,7 +428,8 @@ func (pc *PyTorchController) reconcilePyTorchJobs(job *v1beta2.PyTorchJob) error

if pc.Config.EnableGangScheduling {
minAvailableReplicas := getTotalReplicas(job)
_, err := pc.SyncPodGroup(job, minAvailableReplicas)
priorityClassName := getPriorityClassName(job)
_, err := pc.SyncPodGroup(job, minAvailableReplicas, priorityClassName)
if err != nil {
logger.Warnf("Sync PodGroup %v: %v", job.Name, err)
}
Expand Down
5 changes: 5 additions & 0 deletions pkg/controller.v1beta2/pytorch/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -206,3 +206,8 @@ func getTotalFailedReplicas(job *v1beta2.PyTorchJob) int32 {
}
return totalFailedReplicas
}

func getPriorityClassName(job *v1beta2.PyTorchJob) string {
priorityClassName := *(job.Spec.PriorityClassName)
return priorityClassName
}

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.