Skip to content

Commit

Permalink
UPSTREAM: <carry>: PR-124064 fix grace period used for immediate evic…
Browse files Browse the repository at this point in the history
…tions

Signed-off-by: Kevin Hannon <[email protected]>
  • Loading branch information
olyazavr authored and kannon92 committed Dec 12, 2024
1 parent df0195c commit c0eea14
Show file tree
Hide file tree
Showing 5 changed files with 204 additions and 32 deletions.
11 changes: 7 additions & 4 deletions pkg/kubelet/eviction/eviction_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,9 @@ const (
signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
// signalEmptyDirFsLimit is amount of storage available on filesystem requested by an emptyDir
signalEmptyDirFsLimit string = "emptydirfs.limit"
// immediateEvictionGracePeriodSeconds is how long we give pods to shut down when we
// need them to evict quickly due to resource pressure
immediateEvictionGracePeriodSeconds = 1
)

// managerImpl implements Manager
Expand Down Expand Up @@ -405,7 +408,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
gracePeriodOverride := int64(0)
gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
Expand Down Expand Up @@ -525,7 +528,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
return true
}
Expand Down Expand Up @@ -553,7 +556,7 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
// the total usage of pod exceeds the total size limit of containers, evict the pod
message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
if m.evictPod(pod, 0, message, nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, message, nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
return true
}
Expand All @@ -579,7 +582,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P

if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
return true
}
Expand Down
157 changes: 131 additions & 26 deletions pkg/kubelet/eviction/eviction_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -123,10 +123,10 @@ func makePodWithPIDStats(name string, priority int32, processCount uint64) (*v1.
return pod, podStats
}

func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string) (*v1.Pod, statsapi.PodStats) {
func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string, volumes []v1.Volume) (*v1.Pod, statsapi.PodStats) {
pod := newPod(name, priority, []v1.Container{
newContainer(name, requests, limits),
}, nil)
}, volumes)
podStats := newPodDiskStats(pod, parseQuantity(rootFsUsed), parseQuantity(logsUsed), parseQuantity(perLocalVolumeUsed))
return pod, podStats
}
Expand Down Expand Up @@ -505,7 +505,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
Quantity: quantityMustParse("2Gi"),
},
},
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. Container above-requests was using 700Mi, request is 100Mi, has larger consumption of ephemeral-storage. ",
podToMakes: []podToMake{
{name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"},
{name: "above-requests", requests: newResourceList("", "", "100Mi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "700Mi"},
Expand All @@ -516,7 +516,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "10Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalImageFsAvailable,
Operator: evictionapi.OpLessThan,
Expand All @@ -537,7 +537,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi.Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalContainerFsAvailable,
Operator: evictionapi.OpLessThan,
Expand All @@ -557,7 +557,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "10Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Expand Down Expand Up @@ -588,7 +588,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
Expand Down Expand Up @@ -835,8 +835,8 @@ func TestMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// the best-effort pod should not admit, burstable should
Expand Down Expand Up @@ -1106,8 +1106,8 @@ func TestPIDPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod but should have had a grace period override.")
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
Expand Down Expand Up @@ -1336,7 +1336,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
Expand Down Expand Up @@ -1379,7 +1379,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
}

// create a best effort pod to test admission
podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi")
podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi", nil)

// synchronize
_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
Expand Down Expand Up @@ -1494,8 +1494,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
Expand Down Expand Up @@ -1644,8 +1644,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure, but not below the min-reclaim amount
Expand All @@ -1668,8 +1668,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure and ensure the min-reclaim amount
Expand Down Expand Up @@ -1858,7 +1858,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
Expand Down Expand Up @@ -2060,8 +2060,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce disk pressure
Expand Down Expand Up @@ -2458,8 +2458,8 @@ func TestInodePressureFsInodes(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
Expand Down Expand Up @@ -2666,6 +2666,111 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) {
}
}

func TestStorageLimitEvictions(t *testing.T) {
volumeSizeLimit := resource.MustParse("1Gi")

testCases := map[string]struct {
pod podToMake
volumes []v1.Volume
}{
"eviction due to rootfs above limit": {
pod: podToMake{name: "rootfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "2Gi"},
},
"eviction due to logsfs above limit": {
pod: podToMake{name: "logsfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), logsFsUsed: "2Gi"},
},
"eviction due to local volume above limit": {
pod: podToMake{name: "localvolume-above-limits", priority: defaultPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "2Gi"},
volumes: []v1.Volume{{
Name: "emptyDirVolume",
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: &volumeSizeLimit,
},
},
}},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
podMaker := makePodWithDiskStats
summaryStatsMaker := makeDiskStats
podsToMake := []podToMake{
tc.pod,
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, tc.volumes)
pods = append(pods, pod)
podStats[pod] = podStat
}

podToEvict := pods[0]
activePodsFunc := func() []*v1.Pod {
return pods
}

fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}

config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
}

diskStat := diskStats{
rootFsAvailableBytes: "200Mi",
imageFsAvailableBytes: "200Mi",
podStats: podStats,
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStat)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
localStorageCapacityIsolation: true,
}

_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager expects no error but got %v", err)
}

if podKiller.pod == nil {
t.Fatalf("Manager should have selected a pod for eviction")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager should have killed pod: %v, but instead killed: %v", podToEvict.Name, podKiller.pod.Name)
}
if *podKiller.gracePeriodOverride != 1 {
t.Errorf("Manager should have evicted with gracePeriodOverride of 1, but used: %v", *podKiller.gracePeriodOverride)
}
})
}
}

// TestAllocatableMemoryPressure
func TestAllocatableMemoryPressure(t *testing.T) {
podMaker := makePodWithMemoryStats
Expand Down Expand Up @@ -2767,8 +2872,8 @@ func TestAllocatableMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}
// reset state
podKiller.pod = nil
Expand Down
3 changes: 2 additions & 1 deletion pkg/kubelet/eviction/helpers_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3053,8 +3053,9 @@ func newPodDiskStats(pod *v1.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resou

rootFsUsedBytes := uint64(rootFsUsed.Value())
logsUsedBytes := uint64(logsUsed.Value())
for range pod.Spec.Containers {
for _, container := range pod.Spec.Containers {
result.Containers = append(result.Containers, statsapi.ContainerStats{
Name: container.Name,
Rootfs: &statsapi.FsStats{
UsedBytes: &rootFsUsedBytes,
},
Expand Down
5 changes: 4 additions & 1 deletion pkg/kubelet/pod_workers.go
Original file line number Diff line number Diff line change
Expand Up @@ -979,23 +979,26 @@ func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *
// enforce the restriction that a grace period can only decrease and track whatever our value is,
// then ensure a calculated value is passed down to lower levels
gracePeriod := status.gracePeriod
overridden := false
// this value is bedrock truth - the apiserver owns telling us this value calculated by apiserver
if override := pod.DeletionGracePeriodSeconds; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
overridden = true
}
}
// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
if options != nil {
if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
overridden = true
}
}
}
// make a best effort to default this value to the pod's desired intent, in the event
// the kubelet provided no requested value (graceful termination?)
if gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
if !overridden && gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
}
// no matter what, we always supply a grace period of 1
Expand Down
Loading

0 comments on commit c0eea14

Please sign in to comment.