diff --git a/README.md b/README.md index 15bbffcef2..19baadcf54 100644 --- a/README.md +++ b/README.md @@ -2,7 +2,7 @@ ![Release Charts](https://github.com/kubernetes-sigs/descheduler/workflows/Release%20Charts/badge.svg)

- ↖️ Click at the [bullet list icon] at the top left corner of the Readme visualization for the github generated table of contents. + ↗️️ Click at the [bullet list icon] at the top right corner of the Readme visualization for the github generated table of contents.

diff --git a/charts/descheduler/templates/NOTES.txt b/charts/descheduler/templates/NOTES.txt index d0a0f8b2da..5882980483 100644 --- a/charts/descheduler/templates/NOTES.txt +++ b/charts/descheduler/templates/NOTES.txt @@ -1,7 +1,7 @@ Descheduler installed as a {{ .Values.kind }}. {{- if eq .Values.kind "Deployment" }} -{{- if eq .Values.replicas 1.0}} +{{- if eq (.Values.replicas | int) 1 }} WARNING: You set replica count as 1 and workload kind as Deployment however leaderElection is not enabled. Consider enabling Leader Election for HA mode. {{- end}} {{- if .Values.leaderElection }} diff --git a/charts/descheduler/templates/cronjob.yaml b/charts/descheduler/templates/cronjob.yaml index 73e3714b9f..9d18adf093 100644 --- a/charts/descheduler/templates/cronjob.yaml +++ b/charts/descheduler/templates/cronjob.yaml @@ -81,7 +81,11 @@ spec: args: - --policy-config-file=/policy-dir/policy.yaml {{- range $key, $value := .Values.cmdOptions }} - - {{ printf "--%s" $key }}{{ if $value }}={{ $value }}{{ end }} + {{- if ne $value nil }} + - {{ printf "--%s=%s" $key (toString $value) }} + {{- else }} + - {{ printf "--%s" $key }} + {{- end }} {{- end }} livenessProbe: {{- toYaml .Values.livenessProbe | nindent 16 }} diff --git a/charts/descheduler/templates/deployment.yaml b/charts/descheduler/templates/deployment.yaml index ff90a6602f..53d18cb665 100644 --- a/charts/descheduler/templates/deployment.yaml +++ b/charts/descheduler/templates/deployment.yaml @@ -7,7 +7,7 @@ metadata: labels: {{- include "descheduler.labels" . | nindent 4 }} spec: - {{- if gt .Values.replicas 1.0}} + {{- if gt (.Values.replicas | int) 1 }} {{- if not .Values.leaderElection.enabled }} {{- fail "You must set leaderElection to use more than 1 replica"}} {{- end}} @@ -53,7 +53,11 @@ spec: - --policy-config-file=/policy-dir/policy.yaml - --descheduling-interval={{ required "deschedulingInterval required for running as Deployment" .Values.deschedulingInterval }} {{- range $key, $value := .Values.cmdOptions }} - - {{ printf "--%s" $key }}{{ if $value }}={{ $value }}{{ end }} + {{- if ne $value nil }} + - {{ printf "--%s=%s" $key (toString $value) }} + {{- else }} + - {{ printf "--%s" $key }} + {{- end }} {{- end }} {{- include "descheduler.leaderElection" . | nindent 12 }} ports: diff --git a/charts/descheduler/values.yaml b/charts/descheduler/values.yaml index 5d4d8ed9de..7013ce52ff 100644 --- a/charts/descheduler/values.yaml +++ b/charts/descheduler/values.yaml @@ -111,14 +111,13 @@ deschedulerPolicy: args: podRestartThreshold: 100 includingInitContainers: true - - name: RemovePodsViolatingNodeTaints + - name: RemovePodsViolatingNodeAffinity args: nodeAffinityType: - - requiredDuringSchedulingIgnoredDuringExecution + - requiredDuringSchedulingIgnoredDuringExecution + - name: RemovePodsViolatingNodeTaints - name: RemovePodsViolatingInterPodAntiAffinity - name: RemovePodsViolatingTopologySpreadConstraint - args: - includeSoftConstraints: false - name: LowNodeUtilization args: thresholds: @@ -133,7 +132,6 @@ deschedulerPolicy: balance: enabled: - RemoveDuplicates - - RemovePodsViolatingNodeAffinity - RemovePodsViolatingTopologySpreadConstraint - LowNodeUtilization deschedule: diff --git a/cmd/descheduler/app/server.go b/cmd/descheduler/app/server.go index 3056a25fa3..e7ef713460 100644 --- a/cmd/descheduler/app/server.go +++ b/cmd/descheduler/app/server.go @@ -77,7 +77,7 @@ func NewDeschedulerCommand(out io.Writer) *cobra.Command { secureServing.DisableHTTP2 = !s.EnableHTTP2 - ctx, done := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + ctx, done := signal.NotifyContext(cmd.Context(), syscall.SIGINT, syscall.SIGTERM) pathRecorderMux := mux.NewPathRecorderMux("descheduler") if !s.DisableMetrics { diff --git a/pkg/descheduler/node/node.go b/pkg/descheduler/node/node.go index eed0604639..d283280656 100644 --- a/pkg/descheduler/node/node.go +++ b/pkg/descheduler/node/node.go @@ -343,9 +343,30 @@ func podMatchesInterPodAntiAffinity(nodeIndexer podutil.GetPodsAssignedToNodeFun if err != nil { return false, fmt.Errorf("error listing all pods: %v", err) } + assignedPodsInNamespace := podutil.GroupByNamespace(podsOnNode) - podsInANamespace := podutil.GroupByNamespace(podsOnNode) - nodeMap := utils.CreateNodeMap([]*v1.Node{node}) + for _, term := range utils.GetPodAntiAffinityTerms(pod.Spec.Affinity.PodAntiAffinity) { + namespaces := utils.GetNamespacesFromPodAffinityTerm(pod, &term) + selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector) + if err != nil { + klog.ErrorS(err, "Unable to convert LabelSelector into Selector") + return false, err + } + + for namespace := range namespaces { + for _, assignedPod := range assignedPodsInNamespace[namespace] { + if assignedPod.Name == pod.Name || !utils.PodMatchesTermsNamespaceAndSelector(assignedPod, namespaces, selector) { + klog.V(4).InfoS("Pod doesn't match inter-pod anti-affinity rule of assigned pod on node", "candidatePod", klog.KObj(pod), "assignedPod", klog.KObj(assignedPod)) + continue + } + + if _, ok := node.Labels[term.TopologyKey]; ok { + klog.V(1).InfoS("Pod matches inter-pod anti-affinity rule of assigned pod on node", "candidatePod", klog.KObj(pod), "assignedPod", klog.KObj(assignedPod)) + return true, nil + } + } + } + } - return utils.CheckPodsWithAntiAffinityExist(pod, podsInANamespace, nodeMap), nil + return false, nil } diff --git a/pkg/descheduler/node/node_test.go b/pkg/descheduler/node/node_test.go index dafc0e9205..383e18bc51 100644 --- a/pkg/descheduler/node/node_test.go +++ b/pkg/descheduler/node/node_test.go @@ -759,6 +759,9 @@ func TestNodeFit(t *testing.T) { "region": "main-region", } }) + + nodeNolabel := test.BuildTestNode("node", 64000, 128*1000*1000*1000, 2, nil) + tests := []struct { description string pod *v1.Pod @@ -767,7 +770,7 @@ func TestNodeFit(t *testing.T) { err error }{ { - description: "insufficient cpu", + description: "Insufficient cpu", pod: test.BuildTestPod("p1", 10000, 2*1000*1000*1000, "", nil), node: node, podsOnNode: []*v1.Pod{ @@ -776,7 +779,7 @@ func TestNodeFit(t *testing.T) { err: errors.New("insufficient cpu"), }, { - description: "insufficient pod num", + description: "Insufficient pod num", pod: test.BuildTestPod("p1", 1000, 2*1000*1000*1000, "", nil), node: node, podsOnNode: []*v1.Pod{ @@ -786,7 +789,7 @@ func TestNodeFit(t *testing.T) { err: errors.New("insufficient pods"), }, { - description: "matches inter-pod anti-affinity rule of pod on node", + description: "Pod matches inter-pod anti-affinity rule of other pod on node", pod: test.PodWithPodAntiAffinity(test.BuildTestPod("p1", 1000, 1000, node.Name, nil), "foo", "bar"), node: node, podsOnNode: []*v1.Pod{ @@ -795,11 +798,36 @@ func TestNodeFit(t *testing.T) { err: errors.New("pod matches inter-pod anti-affinity rule of other pod on node"), }, { - description: "pod fits on node", + description: "Pod doesn't match inter-pod anti-affinity rule of other pod on node, because pod and other pod is not same namespace", + pod: test.PodWithPodAntiAffinity(test.BuildTestPod("p1", 1000, 1000, node.Name, nil), "foo", "bar"), + node: node, + podsOnNode: []*v1.Pod{ + test.PodWithPodAntiAffinity(test.BuildTestPod("p2", 1000, 1000, node.Name, func(pod *v1.Pod) { + pod.Namespace = "test" + }), "foo", "bar"), + }, + }, + { + description: "Pod doesn't match inter-pod anti-affinity rule of other pod on node, because other pod not match labels of pod", + pod: test.PodWithPodAntiAffinity(test.BuildTestPod("p1", 1000, 1000, node.Name, nil), "foo", "bar"), + node: node, + podsOnNode: []*v1.Pod{ + test.PodWithPodAntiAffinity(test.BuildTestPod("p2", 1000, 1000, node.Name, nil), "foo1", "bar1"), + }, + }, + { + description: "Pod doesn't match inter-pod anti-affinity rule of other pod on node, because node have no topologyKey", + pod: test.PodWithPodAntiAffinity(test.BuildTestPod("p1", 1000, 1000, "node1", nil), "foo", "bar"), + node: nodeNolabel, + podsOnNode: []*v1.Pod{ + test.PodWithPodAntiAffinity(test.BuildTestPod("p2", 1000, 1000, node.Name, nil), "foo", "bar"), + }, + }, + { + description: "Pod fits on node", pod: test.BuildTestPod("p1", 1000, 1000, "", func(pod *v1.Pod) {}), node: node, podsOnNode: []*v1.Pod{}, - err: nil, }, } diff --git a/pkg/utils/predicates.go b/pkg/utils/predicates.go index 48f25a1378..7153f17926 100644 --- a/pkg/utils/predicates.go +++ b/pkg/utils/predicates.go @@ -24,10 +24,37 @@ import ( v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" "k8s.io/apimachinery/pkg/labels" + "k8s.io/apimachinery/pkg/util/sets" "k8s.io/component-helpers/scheduling/corev1" "k8s.io/klog/v2" ) +// GetNamespacesFromPodAffinityTerm returns a set of names +// according to the namespaces indicated in podAffinityTerm. +// If namespaces is empty it considers the given pod's namespace. +func GetNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] { + names := sets.New[string]() + if len(podAffinityTerm.Namespaces) == 0 { + names.Insert(pod.Namespace) + } else { + names.Insert(podAffinityTerm.Namespaces...) + } + return names +} + +// PodMatchesTermsNamespaceAndSelector returns true if the given +// matches the namespace and selector defined by `s . +func PodMatchesTermsNamespaceAndSelector(pod *v1.Pod, namespaces sets.Set[string], selector labels.Selector) bool { + if !namespaces.Has(pod.Namespace) { + return false + } + + if !selector.Matches(labels.Set(pod.Labels)) { + return false + } + return true +} + // The following code has been copied from predicates package to avoid the // huge vendoring issues, mostly copied from // k8s.io/kubernetes/plugin/pkg/scheduler/algorithm/predicates/ @@ -309,42 +336,52 @@ func CreateNodeMap(nodes []*v1.Node) map[string]*v1.Node { return m } -// CheckPodsWithAntiAffinityExist checks if there are other pods on the node that the current pod cannot tolerate. -func CheckPodsWithAntiAffinityExist(pod *v1.Pod, pods map[string][]*v1.Pod, nodeMap map[string]*v1.Node) bool { - affinity := pod.Spec.Affinity - if affinity != nil && affinity.PodAntiAffinity != nil { - for _, term := range getPodAntiAffinityTerms(affinity.PodAntiAffinity) { - namespaces := getNamespacesFromPodAffinityTerm(pod, &term) - selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector) - if err != nil { - klog.ErrorS(err, "Unable to convert LabelSelector into Selector") - return false - } - for namespace := range namespaces { - for _, existingPod := range pods[namespace] { - if existingPod.Name != pod.Name && podMatchesTermsNamespaceAndSelector(existingPod, namespaces, selector) { - node, ok := nodeMap[pod.Spec.NodeName] - if !ok { - continue - } - nodeHavingExistingPod, ok := nodeMap[existingPod.Spec.NodeName] - if !ok { - continue - } - if hasSameLabelValue(node, nodeHavingExistingPod, term.TopologyKey) { - klog.V(1).InfoS("Found Pods matching PodAntiAffinity", "pod with anti-affinity", klog.KObj(pod)) - return true - } - } +// CheckPodsWithAntiAffinityExist checks if there are other pods on the node that the current candidate pod cannot tolerate. +func CheckPodsWithAntiAffinityExist(candidatePod *v1.Pod, assignedPods map[string][]*v1.Pod, nodeMap map[string]*v1.Node) bool { + nodeHavingCandidatePod, ok := nodeMap[candidatePod.Spec.NodeName] + if !ok { + klog.Warningf("CandidatePod %s does not exist in nodeMap", klog.KObj(candidatePod)) + return false + } + + affinity := candidatePod.Spec.Affinity + if affinity == nil || affinity.PodAntiAffinity == nil { + return false + } + + for _, term := range GetPodAntiAffinityTerms(affinity.PodAntiAffinity) { + namespaces := GetNamespacesFromPodAffinityTerm(candidatePod, &term) + selector, err := metav1.LabelSelectorAsSelector(term.LabelSelector) + if err != nil { + klog.ErrorS(err, "Unable to convert LabelSelector into Selector") + return false + } + + for namespace := range namespaces { + for _, assignedPod := range assignedPods[namespace] { + if assignedPod.Name == candidatePod.Name || !PodMatchesTermsNamespaceAndSelector(assignedPod, namespaces, selector) { + klog.V(4).InfoS("CandidatePod doesn't matches inter-pod anti-affinity rule of assigned pod on node", "candidatePod", klog.KObj(candidatePod), "assignedPod", klog.KObj(assignedPod)) + continue + } + + nodeHavingAssignedPod, ok := nodeMap[assignedPod.Spec.NodeName] + if !ok { + continue + } + + if hasSameLabelValue(nodeHavingCandidatePod, nodeHavingAssignedPod, term.TopologyKey) { + klog.V(1).InfoS("CandidatePod matches inter-pod anti-affinity rule of assigned pod on node", "candidatePod", klog.KObj(candidatePod), "assignedPod", klog.KObj(assignedPod)) + return true } } } } + return false } -// getPodAntiAffinityTerms gets the antiaffinity terms for the given pod. -func getPodAntiAffinityTerms(podAntiAffinity *v1.PodAntiAffinity) (terms []v1.PodAffinityTerm) { +// GetPodAntiAffinityTerms gets the antiaffinity terms for the given pod. +func GetPodAntiAffinityTerms(podAntiAffinity *v1.PodAntiAffinity) (terms []v1.PodAffinityTerm) { if podAntiAffinity != nil { if len(podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution) != 0 { terms = podAntiAffinity.RequiredDuringSchedulingIgnoredDuringExecution diff --git a/pkg/utils/priority.go b/pkg/utils/priority.go index 2ba401a4ed..9a389dbe95 100644 --- a/pkg/utils/priority.go +++ b/pkg/utils/priority.go @@ -4,42 +4,13 @@ import ( "context" "fmt" - v1 "k8s.io/api/core/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" - "k8s.io/apimachinery/pkg/labels" - "k8s.io/apimachinery/pkg/util/sets" clientset "k8s.io/client-go/kubernetes" "sigs.k8s.io/descheduler/pkg/api" ) const SystemCriticalPriority = 2 * int32(1000000000) -// getNamespacesFromPodAffinityTerm returns a set of names -// according to the namespaces indicated in podAffinityTerm. -// If namespaces is empty it considers the given pod's namespace. -func getNamespacesFromPodAffinityTerm(pod *v1.Pod, podAffinityTerm *v1.PodAffinityTerm) sets.Set[string] { - names := sets.New[string]() - if len(podAffinityTerm.Namespaces) == 0 { - names.Insert(pod.Namespace) - } else { - names.Insert(podAffinityTerm.Namespaces...) - } - return names -} - -// podMatchesTermsNamespaceAndSelector returns true if the given -// matches the namespace and selector defined by `s . -func podMatchesTermsNamespaceAndSelector(pod *v1.Pod, namespaces sets.Set[string], selector labels.Selector) bool { - if !namespaces.Has(pod.Namespace) { - return false - } - - if !selector.Matches(labels.Set(pod.Labels)) { - return false - } - return true -} - // GetPriorityFromPriorityClass gets priority from the given priority class. // If no priority class is provided, it will return SystemCriticalPriority by default. func GetPriorityFromPriorityClass(ctx context.Context, client clientset.Interface, name string) (int32, error) {