
Commit 57305d5

UPSTREAM: <carry>: OCPBUGS-52169: Workload partitioning of static init containers
The pod modification routine that prepares containers for Workload Partitioning quits early when it encounters a container with no resources specified. This leaks resource capacity on the node: the Pod is left untouched, so it keeps reporting its full resource requests to the scheduler even though it does not actually use them, because the logic that moves the container onto the management partition still works fine. The end result is lowered node capacity for scheduling.

Signed-off-by: Martin Sivak <[email protected]>
1 parent b94367c
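
In QoS terms, the reasoning behind the fix is: a container with neither a cpu nor a memory request is best-effort as far as partitioning is concerned, so leaving it untouched cannot change the Pod's QoS class, whereas a container that sets only one of the two must still fail validation, because stripping its requests could change the class. Below is a condensed, self-contained sketch of the guard the commit introduces; the helper name skippableForPartitioning is illustrative only, the real change lives in updateContainers in the diff that follows.

package main

import (
    "fmt"

    v1 "k8s.io/api/core/v1"
    "k8s.io/apimachinery/pkg/api/resource"
)

// skippableForPartitioning mirrors the new guard: a container with no
// requests at all, or with neither a cpu request nor a (possibly
// limit-implied) memory request, is simply left alone instead of
// failing the whole Pod.
func skippableForPartitioning(c *v1.Container) bool {
    if c.Resources.Requests == nil {
        return true
    }
    _, cpuOk := c.Resources.Requests[v1.ResourceCPU]
    _, memoryOk := c.Resources.Requests[v1.ResourceMemory]
    // A memory limit without an explicit request implies an equal request.
    if !memoryOk && c.Resources.Limits != nil {
        _, memoryOk = c.Resources.Limits[v1.ResourceMemory]
    }
    return !cpuOk && !memoryOk
}

func main() {
    bestEffort := v1.Container{Name: "sidecar"}
    burstable := v1.Container{
        Name: "nginx",
        Resources: v1.ResourceRequirements{
            Requests: v1.ResourceList{v1.ResourceCPU: resource.MustParse("100m")},
        },
    }
    fmt.Println(skippableForPartitioning(&bestEffort)) // true: skipped, Pod left untouched
    fmt.Println(skippableForPartitioning(&burstable))  // false: still subject to the cpu/memory checks
}

The limits fallback reflects that a memory limit without an explicit request implies an equal request, so such a container is not treated as best-effort.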

2 files changed: +238 -18 lines

2 files changed

+238
-18
lines changed

pkg/kubelet/managed/managed.go

Lines changed: 25 additions & 3 deletions
@@ -117,14 +117,36 @@ func GenerateResourceName(workloadName string) v1.ResourceName {
 func updateContainers(workloadName string, pod *v1.Pod) error {
     updateContainer := func(container *v1.Container) error {
         if container.Resources.Requests == nil {
-            return fmt.Errorf("managed container %v does not have Resource.Requests", container.Name)
+            // Nothing to modify, but that is OK, it will not
+            // change the QoS class of the modified Pod
+            return nil
         }
-        if _, ok := container.Resources.Requests[v1.ResourceCPU]; !ok {
+
+        _, cpuOk := container.Resources.Requests[v1.ResourceCPU]
+        _, memoryOk := container.Resources.Requests[v1.ResourceMemory]
+
+        // It is possible memory is configured using limits only and that implies
+        // requests with the same value, check for that in case memory requests
+        // are not present by themselves.
+        if !memoryOk && container.Resources.Limits != nil {
+            _, memoryOk = container.Resources.Limits[v1.ResourceMemory]
+        }
+
+        // When both cpu and memory requests are missing, there is nothing
+        // to do
+        if !cpuOk && !memoryOk {
+            return nil
+        }
+
+        // Both memory and cpu have to be set to make sure stripping them
+        // will not change the QoS class of the Pod
+        if !cpuOk {
             return fmt.Errorf("managed container %v does not have cpu requests", container.Name)
         }
-        if _, ok := container.Resources.Requests[v1.ResourceMemory]; !ok {
+        if !memoryOk {
             return fmt.Errorf("managed container %v does not have memory requests", container.Name)
         }
+
         if container.Resources.Limits == nil {
             container.Resources.Limits = v1.ResourceList{}
         }

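For context on the {"cpushares":102} and {"cpushares":20,"cpulimit":100} values asserted in the test expectations below: the per-container workload annotation records the stripped CPU request converted to cgroup cpu.shares. A minimal standalone sketch of that conversion, assuming the conventional 1024-shares-per-CPU scaling with a floor of 2 shares; the helper name is illustrative, not the kubelet's own function.

package main

import "fmt"

// milliCPUToShares converts a CPU request in millicores to cgroup
// cpu.shares, assuming 1024 shares per CPU and a minimum of 2 shares.
func milliCPUToShares(milliCPU int64) int64 {
    const (
        sharesPerCPU  = 1024
        milliCPUToCPU = 1000
        minShares     = 2
    )
    shares := (milliCPU * sharesPerCPU) / milliCPUToCPU
    if shares < minShares {
        return minShares
    }
    return shares
}

func main() {
    fmt.Println(milliCPUToShares(100)) // 102, matching {"cpushares":102}
    fmt.Println(milliCPUToShares(20))  // 20, matching {"cpushares":20,...}
}

Under that scaling a 100m request maps to 100 * 1024 / 1000 = 102 shares and a 20m request to 20 shares, which is where the cpushares values in the expected annotations come from.
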
pkg/kubelet/managed/managed_test.go

Lines changed: 213 additions & 15 deletions
@@ -19,17 +19,6 @@ func TestModifyStaticPodForPinnedManagementErrorStates(t *testing.T) {
         pod *v1.Pod
         expectedError error
     }{
-        {
-            pod: createPod(workloadAnnotations, nil,
-                &v1.Container{
-                    Name: "nginx",
-                    Image: "test/image",
-                    Resources: v1.ResourceRequirements{
-                        Requests: nil,
-                    },
-                }),
-            expectedError: fmt.Errorf("managed container nginx does not have Resource.Requests"),
-        },
         {
             pod: createPod(workloadAnnotations, nil,
                 &v1.Container{
@@ -129,6 +118,7 @@ func TestStaticPodManaged(t *testing.T) {
         pod *v1.Pod
         expectedAnnotations map[string]string
         isGuaranteed bool
+        isBestEffort bool
     }{
         {
             pod: &v1.Pod{
@@ -168,6 +158,47 @@ func TestStaticPodManaged(t *testing.T) {
                 "resources.workload.openshift.io/nginx": `{"cpushares":102}`,
             },
         },
+        {
+            pod: &v1.Pod{
+                TypeMeta: metav1.TypeMeta{
+                    Kind: "Pod",
+                    APIVersion: "",
+                },
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test",
+                    UID: "12345",
+                    Namespace: "mynamespace",
+                    Annotations: map[string]string{
+                        "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                    },
+                },
+                Spec: v1.PodSpec{
+                    Containers: []v1.Container{
+                        {
+                            Name: "nginx",
+                            Image: "test/image",
+                            Resources: v1.ResourceRequirements{
+                                Requests: v1.ResourceList{
+                                    v1.ResourceName(v1.ResourceMemory): resource.MustParse("100m"),
+                                    v1.ResourceName(v1.ResourceCPU): resource.MustParse("100m"),
+                                },
+                                Limits: v1.ResourceList{
+                                    v1.ResourceName(v1.ResourceMemory): resource.MustParse("100m"),
+                                },
+                            },
+                        },
+                    },
+                    SecurityContext: &v1.PodSecurityContext{},
+                },
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expectedAnnotations: map[string]string{
+                "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                "resources.workload.openshift.io/nginx": `{"cpushares":102}`,
+            },
+        },
         {
             pod: &v1.Pod{
                 TypeMeta: metav1.TypeMeta{
@@ -270,6 +301,164 @@ func TestStaticPodManaged(t *testing.T) {
                 "resources.workload.openshift.io/c1": `{"cpushares":20,"cpulimit":100}`,
             },
         },
+        {
+            pod: &v1.Pod{
+                TypeMeta: metav1.TypeMeta{
+                    Kind: "Pod",
+                    APIVersion: "",
+                },
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test",
+                    UID: "12345",
+                    Namespace: "mynamespace",
+                    Annotations: map[string]string{
+                        "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                    },
+                },
+                Spec: v1.PodSpec{
+                    Containers: []v1.Container{
+                        {
+                            Name: "c1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{},
+                        },
+                    },
+                    SecurityContext: &v1.PodSecurityContext{},
+                },
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expectedAnnotations: map[string]string{
+                "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+            },
+            isBestEffort: true,
+        },
+        {
+            pod: &v1.Pod{
+                TypeMeta: metav1.TypeMeta{
+                    Kind: "Pod",
+                    APIVersion: "",
+                },
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test",
+                    UID: "12345",
+                    Namespace: "mynamespace",
+                    Annotations: map[string]string{
+                        "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                    },
+                },
+                Spec: v1.PodSpec{
+                    Containers: []v1.Container{
+                        {
+                            Name: "c1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{
+                                Requests: v1.ResourceList{
+                                    v1.ResourceName("dummy"): resource.MustParse("20m"),
+                                },
+                            },
+                        },
+                    },
+                    SecurityContext: &v1.PodSecurityContext{},
+                },
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expectedAnnotations: map[string]string{
+                "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+            },
+            isBestEffort: true,
+        },
+        {
+            pod: &v1.Pod{
+                TypeMeta: metav1.TypeMeta{
+                    Kind: "Pod",
+                    APIVersion: "",
+                },
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test",
+                    UID: "12345",
+                    Namespace: "mynamespace",
+                    Annotations: map[string]string{
+                        "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                    },
+                },
+                Spec: v1.PodSpec{
+                    Containers: []v1.Container{
+                        {
+                            Name: "c1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{},
+                        },
+                    },
+                    InitContainers: []v1.Container{
+                        {
+                            Name: "ic1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{},
+                        },
+                    },
+                    SecurityContext: &v1.PodSecurityContext{},
+                },
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expectedAnnotations: map[string]string{
+                "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+            },
+            isBestEffort: true,
+        },
+        {
+            pod: &v1.Pod{
+                TypeMeta: metav1.TypeMeta{
+                    Kind: "Pod",
+                    APIVersion: "",
+                },
+                ObjectMeta: metav1.ObjectMeta{
+                    Name: "test",
+                    UID: "12345",
+                    Namespace: "mynamespace",
+                    Annotations: map[string]string{
+                        "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+                    },
+                },
+                Spec: v1.PodSpec{
+                    Containers: []v1.Container{
+                        {
+                            Name: "c1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{
+                                Requests: v1.ResourceList{
+                                    v1.ResourceName("dummy"): resource.MustParse("20m"),
+                                },
+                            },
+                        },
+                    },
+                    InitContainers: []v1.Container{
+                        {
+                            Name: "ic1",
+                            Image: "test/nginx",
+                            Resources: v1.ResourceRequirements{
+                                Requests: v1.ResourceList{
+                                    v1.ResourceName("dummy"): resource.MustParse("20m"),
+                                },
+                            },
+                        },
+                    },
+                    SecurityContext: &v1.PodSecurityContext{},
+                },
+                Status: v1.PodStatus{
+                    Phase: v1.PodPending,
+                },
+            },
+            expectedAnnotations: map[string]string{
+                "target.workload.openshift.io/management": `{"effect": "PreferredDuringScheduling"}`,
+            },
+            isBestEffort: true,
+        },
         {
             pod: &v1.Pod{
                 TypeMeta: metav1.TypeMeta{
@@ -481,15 +670,24 @@ func TestStaticPodManaged(t *testing.T) {
             if container.Resources.Requests.Cpu().String() != "0" && !tc.isGuaranteed {
                 t.Errorf("cpu requests should be 0 got %v", container.Resources.Requests.Cpu().String())
             }
-            if container.Resources.Requests.Memory().String() == "0" && !tc.isGuaranteed {
-                t.Errorf("memory requests were %v but should be %v", container.Resources.Requests.Memory().String(), container.Resources.Requests.Memory().String())
+            if container.Resources.Requests.Memory().String() == "0" && !tc.isGuaranteed && !tc.isBestEffort {
+                t.Errorf("memory requests were %v but should be %v in container %v", container.Resources.Requests.Memory().String(), container.Resources.Requests.Memory().String(), container.Name)
             }
-            if _, exists := container.Resources.Requests[GenerateResourceName(workloadName)]; !exists && !tc.isGuaranteed {
+            if container.Resources.Requests.Memory().String() != "0" && !tc.isGuaranteed && tc.isBestEffort {
+                t.Errorf("memory requests should be 0 got %v", container.Resources.Requests.Memory().String())
+            }
+            if _, exists := container.Resources.Requests[GenerateResourceName(workloadName)]; !exists && !tc.isGuaranteed && !tc.isBestEffort {
                 t.Errorf("managed capacity label missing from pod %v and container %v", tc.pod.Name, container.Name)
             }
-            if _, exists := container.Resources.Limits[GenerateResourceName(workloadName)]; !exists && !tc.isGuaranteed {
+            if _, exists := container.Resources.Limits[GenerateResourceName(workloadName)]; !exists && !tc.isGuaranteed && !tc.isBestEffort {
                 t.Errorf("managed capacity label missing from pod %v and container %v limits", tc.pod.Name, container.Name)
             }
+            if _, exists := container.Resources.Requests[GenerateResourceName(workloadName)]; exists && tc.isBestEffort {
+                t.Errorf("managed capacity label present in best-effort pod %v and container %v requests", tc.pod.Name, container.Name)
+            }
+            if _, exists := container.Resources.Limits[GenerateResourceName(workloadName)]; exists && tc.isBestEffort {
+                t.Errorf("managed capacity label present in best-effort pod %v and container %v limits", tc.pod.Name, container.Name)
+            }
         }
     }
 }
