UPSTREAM: <carry> 124063: fix grace period used for immediate evictions
Signed-off-by: Kevin Hannon <[email protected]>
olyazavr authored and kannon92 committed Dec 12, 2024
1 parent df0195c commit 3333988
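
Before this change, evictions that are meant to be immediate (hard-threshold evictions and the emptyDir / ephemeral-storage limit evictions) passed a grace period override of 0 down to the pod workers. As the pod_workers.go hunk below shows, calculateEffectiveGracePeriod treated 0 as "no value provided" and fell back to the pod's spec.terminationGracePeriodSeconds, so an "immediate" eviction could end up waiting out the pod's full termination grace period. The fix introduces immediateEvictionGracePeriodSeconds = 1 for those eviction paths and adds an overridden flag so that an explicitly requested override is never replaced by the spec default. Existing tests now expect a grace period of 1, and a new TestStorageLimitEvictions covers the storage-limit eviction paths.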
Showing 5 changed files with 204 additions and 32 deletions.
11 changes: 7 additions & 4 deletions pkg/kubelet/eviction/eviction_manager.go
@@ -56,6 +56,9 @@ const (
signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
// signalEmptyDirFsLimit is amount of storage available on filesystem requested by an emptyDir
signalEmptyDirFsLimit string = "emptydirfs.limit"
+// immediateEvictionGracePeriodSeconds is how long we give pods to shut down when we
+// need them to evict quickly due to resource pressure
+immediateEvictionGracePeriodSeconds = 1
)

// managerImpl implements Manager
@@ -405,7 +408,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
-gracePeriodOverride := int64(0)
+gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
@@ -525,7 +528,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
-if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
+if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
return true
}
@@ -553,7 +556,7 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
// the total usage of pod exceeds the total size limit of containers, evict the pod
message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
-if m.evictPod(pod, 0, message, nil, nil) {
+if m.evictPod(pod, immediateEvictionGracePeriodSeconds, message, nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
return true
}
@@ -579,7 +582,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P

if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
-if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
+if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
return true
}
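
The net effect of the eviction_manager.go hunks above, as a minimal, self-contained Go sketch. The threshold and config types here are simplified stand-ins invented for illustration, not the kubelet's real types; only the selection between the new 1-second immediate grace period and MaxPodGracePeriodSeconds mirrors the diff.

package main

import "fmt"

const immediateEvictionGracePeriodSeconds = 1

// Simplified stand-ins for the kubelet's eviction types, for illustration only.
type threshold struct{ gracePeriodSeconds int64 } // hard thresholds carry no grace period
type config struct{ MaxPodGracePeriodSeconds int64 }

func isHardEvictionThreshold(t threshold) bool { return t.gracePeriodSeconds == 0 }

// gracePeriodFor mirrors the selection logic in synchronize(): immediate (hard)
// evictions now request a 1-second grace period instead of 0, while soft
// evictions keep using the configured maximum.
func gracePeriodFor(t threshold, cfg config) int64 {
	gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
	if !isHardEvictionThreshold(t) {
		gracePeriodOverride = cfg.MaxPodGracePeriodSeconds
	}
	return gracePeriodOverride
}

func main() {
	cfg := config{MaxPodGracePeriodSeconds: 30}
	fmt.Println(gracePeriodFor(threshold{gracePeriodSeconds: 0}, cfg))  // hard threshold -> 1
	fmt.Println(gracePeriodFor(threshold{gracePeriodSeconds: 60}, cfg)) // soft threshold -> 30
}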
157 changes: 131 additions & 26 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -123,10 +123,10 @@ func makePodWithPIDStats(name string, priority int32, processCount uint64) (*v1.
return pod, podStats
}

-func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string) (*v1.Pod, statsapi.PodStats) {
+func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string, volumes []v1.Volume) (*v1.Pod, statsapi.PodStats) {
pod := newPod(name, priority, []v1.Container{
newContainer(name, requests, limits),
-}, nil)
+}, volumes)
podStats := newPodDiskStats(pod, parseQuantity(rootFsUsed), parseQuantity(logsUsed), parseQuantity(perLocalVolumeUsed))
return pod, podStats
}
@@ -505,7 +505,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
Quantity: quantityMustParse("2Gi"),
},
},
-evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ",
+evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. Container above-requests was using 700Mi, request is 100Mi, has larger consumption of ephemeral-storage. ",
podToMakes: []podToMake{
{name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"},
{name: "above-requests", requests: newResourceList("", "", "100Mi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "700Mi"},
@@ -516,7 +516,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "10Gi",
containerFsStats: "10Gi",
-evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
+evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalImageFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -537,7 +537,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
-evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
+evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi.Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalContainerFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -557,7 +557,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "10Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
-evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
+evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -588,7 +588,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
-pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
+pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -835,8 +835,8 @@ func TestMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// the best-effort pod should not admit, burstable should
@@ -1106,8 +1106,8 @@ func TestPIDPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod but should have had a grace period override.")
}
observedGracePeriod = *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -1336,7 +1336,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
-pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
+pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -1379,7 +1379,7 @@
}

// create a best effort pod to test admission
-podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi")
+podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi", nil)

// synchronize
_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
@@ -1494,8 +1494,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -1644,8 +1644,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure, but not below the min-reclaim amount
@@ -1668,8 +1668,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure and ensure the min-reclaim amount
@@ -1858,7 +1858,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
-pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
+pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -2060,8 +2060,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce disk pressure
@@ -2458,8 +2458,8 @@ func TestInodePressureFsInodes(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -2666,6 +2666,111 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) {
}
}

func TestStorageLimitEvictions(t *testing.T) {
volumeSizeLimit := resource.MustParse("1Gi")

testCases := map[string]struct {
pod podToMake
volumes []v1.Volume
}{
"eviction due to rootfs above limit": {
pod: podToMake{name: "rootfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "2Gi"},
},
"eviction due to logsfs above limit": {
pod: podToMake{name: "logsfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), logsFsUsed: "2Gi"},
},
"eviction due to local volume above limit": {
pod: podToMake{name: "localvolume-above-limits", priority: defaultPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "2Gi"},
volumes: []v1.Volume{{
Name: "emptyDirVolume",
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: &volumeSizeLimit,
},
},
}},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
podMaker := makePodWithDiskStats
summaryStatsMaker := makeDiskStats
podsToMake := []podToMake{
tc.pod,
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, tc.volumes)
pods = append(pods, pod)
podStats[pod] = podStat
}

podToEvict := pods[0]
activePodsFunc := func() []*v1.Pod {
return pods
}

fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}

config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
}

diskStat := diskStats{
rootFsAvailableBytes: "200Mi",
imageFsAvailableBytes: "200Mi",
podStats: podStats,
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStat)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
localStorageCapacityIsolation: true,
}

_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager expects no error but got %v", err)
}

if podKiller.pod == nil {
t.Fatalf("Manager should have selected a pod for eviction")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager should have killed pod: %v, but instead killed: %v", podToEvict.Name, podKiller.pod.Name)
}
if *podKiller.gracePeriodOverride != 1 {
t.Errorf("Manager should have evicted with gracePeriodOverride of 1, but used: %v", *podKiller.gracePeriodOverride)
}
})
}
}
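
TestStorageLimitEvictions above is added in its entirety by this commit. It drives the rootfs, logs, and emptyDir size-limit eviction paths and asserts that the pod is killed with a gracePeriodOverride of exactly 1 even though MaxPodGracePeriodSeconds is set to 5, confirming that limit-based evictions stay immediate instead of picking up the configured soft-eviction maximum.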

// TestAllocatableMemoryPressure
func TestAllocatableMemoryPressure(t *testing.T) {
podMaker := makePodWithMemoryStats
@@ -2767,8 +2872,8 @@ func TestAllocatableMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
-if observedGracePeriod != int64(0) {
-t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
+if observedGracePeriod != int64(1) {
+t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}
// reset state
podKiller.pod = nil
3 changes: 2 additions & 1 deletion pkg/kubelet/eviction/helpers_test.go
@@ -3053,8 +3053,9 @@ func newPodDiskStats(pod *v1.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resou

rootFsUsedBytes := uint64(rootFsUsed.Value())
logsUsedBytes := uint64(logsUsed.Value())
-for range pod.Spec.Containers {
+for _, container := range pod.Spec.Containers {
result.Containers = append(result.Containers, statsapi.ContainerStats{
+Name: container.Name,
Rootfs: &statsapi.FsStats{
UsedBytes: &rootFsUsedBytes,
},
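
This helper change backs the updated evictionMessage expectations above: newPodDiskStats now records each container's name in its ContainerStats, which is presumably what lets the eviction message name the offending container (for example, "Container above-requests was using 700Mi, request is 100Mi, ...").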
5 changes: 4 additions & 1 deletion pkg/kubelet/pod_workers.go
@@ -979,23 +979,26 @@ func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *
// enforce the restriction that a grace period can only decrease and track whatever our value is,
// then ensure a calculated value is passed down to lower levels
gracePeriod := status.gracePeriod
+overridden := false
// this value is bedrock truth - the apiserver owns telling us this value calculated by apiserver
if override := pod.DeletionGracePeriodSeconds; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
+overridden = true
}
}
// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
if options != nil {
if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
+overridden = true
}
}
}
// make a best effort to default this value to the pod's desired intent, in the event
// the kubelet provided no requested value (graceful termination?)
-if gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
+if !overridden && gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
}
// no matter what, we always supply a grace period of 1
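
To make the pod_workers.go change concrete, here is a minimal, runnable sketch of the fallback logic it fixes. effectiveGracePeriod below is a simplified stand-in for calculateEffectiveGracePeriod, not the real function: it ignores status.gracePeriod and pod.DeletionGracePeriodSeconds and models only the eviction override, the fallback to the pod spec, and the one-second floor referenced in the truncated comment above.

package main

import "fmt"

// effectiveGracePeriod is a simplified stand-in for calculateEffectiveGracePeriod.
// overrideFromEviction models options.PodTerminationGracePeriodSecondsOverride and
// specGracePeriod models pod.Spec.TerminationGracePeriodSeconds.
func effectiveGracePeriod(overrideFromEviction, specGracePeriod *int64) int64 {
	var gracePeriod int64
	overridden := false
	if override := overrideFromEviction; override != nil {
		if gracePeriod == 0 || *override < gracePeriod {
			gracePeriod = *override
			overridden = true
		}
	}
	// Before this commit the condition was only `gracePeriod == 0`, so an explicit
	// 0-second override from the eviction manager fell through to the spec value.
	if !overridden && gracePeriod == 0 && specGracePeriod != nil {
		gracePeriod = *specGracePeriod
	}
	// The real function also never returns less than one second.
	if gracePeriod < 1 {
		gracePeriod = 1
	}
	return gracePeriod
}

func main() {
	spec := int64(30)     // pod's spec.terminationGracePeriodSeconds
	immediate := int64(1) // what the eviction manager passes after this commit
	zero := int64(0)      // what it passed before

	fmt.Println(effectiveGracePeriod(&immediate, &spec)) // 1: eviction override honored
	fmt.Println(effectiveGracePeriod(&zero, &spec))      // 1: explicit 0 no longer becomes 30
	fmt.Println(effectiveGracePeriod(nil, &spec))        // 30: no override, spec still applies
}

With the old condition, the second call would have returned 30, which is the behavior the commit title calls out.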