[release-4.17]: OCPBUGS-46364: Fix grace period used for immediate evictions #2164

Merged
11 changes: 7 additions & 4 deletions pkg/kubelet/eviction/eviction_manager.go
@@ -56,6 +56,9 @@ const (
signalEphemeralPodFsLimit string = "ephemeralpodfs.limit"
// signalEmptyDirFsLimit is amount of storage available on filesystem requested by an emptyDir
signalEmptyDirFsLimit string = "emptydirfs.limit"
// immediateEvictionGracePeriodSeconds is how long we give pods to shut down when we
// need them to evict quickly due to resource pressure
immediateEvictionGracePeriodSeconds = 1
)

// managerImpl implements Manager
@@ -405,7 +408,7 @@ func (m *managerImpl) synchronize(diskInfoProvider DiskInfoProvider, podFunc Act
// we kill at most a single pod during each eviction interval
for i := range activePods {
pod := activePods[i]
gracePeriodOverride := int64(0)
gracePeriodOverride := int64(immediateEvictionGracePeriodSeconds)
if !isHardEvictionThreshold(thresholdToReclaim) {
gracePeriodOverride = m.config.MaxPodGracePeriodSeconds
}
@@ -525,7 +528,7 @@ func (m *managerImpl) emptyDirLimitEviction(podStats statsapi.PodStats, pod *v1.
used := podVolumeUsed[pod.Spec.Volumes[i].Name]
if used != nil && size != nil && size.Sign() == 1 && used.Cmp(*size) > 0 {
// the emptyDir usage exceeds the size limit, evict the pod
if m.evictPod(pod, 0, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(emptyDirMessageFmt, pod.Spec.Volumes[i].Name, size.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEmptyDirFsLimit).Inc()
return true
}
@@ -553,7 +556,7 @@ func (m *managerImpl) podEphemeralStorageLimitEviction(podStats statsapi.PodStat
if podEphemeralStorageTotalUsage.Cmp(podEphemeralStorageLimit) > 0 {
// the total usage of pod exceeds the total size limit of containers, evict the pod
message := fmt.Sprintf(podEphemeralStorageMessageFmt, podEphemeralStorageLimit.String())
if m.evictPod(pod, 0, message, nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, message, nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralPodFsLimit).Inc()
return true
}
@@ -579,7 +582,7 @@ func (m *managerImpl) containerEphemeralStorageLimitEviction(podStats statsapi.P

if ephemeralStorageThreshold, ok := thresholdsMap[containerStat.Name]; ok {
if ephemeralStorageThreshold.Cmp(*containerUsed) < 0 {
if m.evictPod(pod, 0, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
if m.evictPod(pod, immediateEvictionGracePeriodSeconds, fmt.Sprintf(containerEphemeralStorageMessageFmt, containerStat.Name, ephemeralStorageThreshold.String()), nil, nil) {
metrics.Evictions.WithLabelValues(signalEphemeralContainerFsLimit).Inc()
return true
}
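The net effect of the eviction_manager.go hunks above: hard-threshold evictions and the local-storage-limit evictions (emptyDir, pod-level and container-level ephemeral storage) now request a 1-second grace period via immediateEvictionGracePeriodSeconds instead of 0, while soft thresholds keep using MaxPodGracePeriodSeconds. A minimal, self-contained sketch of that selection — kubelet types are reduced to plain values here, and every name other than the new constant is illustrative, not the real API:

```go
package main

import "fmt"

// Mirrors the new constant from the diff; everything else in this sketch is a
// stand-in, not the kubelet's real types.
const immediateEvictionGracePeriodSeconds = 1

type evictionConfig struct {
	MaxPodGracePeriodSeconds int64
}

// gracePeriodFor sketches the selection in synchronize(): hard thresholds (and the
// storage-limit evictions, which pass the constant directly) get the 1-second
// minimum that used to be 0; soft thresholds keep the configured maximum.
func gracePeriodFor(hardThreshold bool, cfg evictionConfig) int64 {
	if hardThreshold {
		return int64(immediateEvictionGracePeriodSeconds)
	}
	return cfg.MaxPodGracePeriodSeconds
}

func main() {
	cfg := evictionConfig{MaxPodGracePeriodSeconds: 30}
	fmt.Println(gracePeriodFor(true, cfg))  // 1  (previously 0)
	fmt.Println(gracePeriodFor(false, cfg)) // 30
}
```

Why the change from 0 to 1 matters becomes clear in the pod_workers.go hunk further down: an override of exactly 0 used to be replaced by the pod's own terminationGracePeriodSeconds during defaulting.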
157 changes: 131 additions & 26 deletions pkg/kubelet/eviction/eviction_manager_test.go
@@ -123,10 +123,10 @@ func makePodWithPIDStats(name string, priority int32, processCount uint64) (*v1.
return pod, podStats
}

func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string) (*v1.Pod, statsapi.PodStats) {
func makePodWithDiskStats(name string, priority int32, requests v1.ResourceList, limits v1.ResourceList, rootFsUsed, logsUsed, perLocalVolumeUsed string, volumes []v1.Volume) (*v1.Pod, statsapi.PodStats) {
pod := newPod(name, priority, []v1.Container{
newContainer(name, requests, limits),
}, nil)
}, volumes)
podStats := newPodDiskStats(pod, parseQuantity(rootFsUsed), parseQuantity(logsUsed), parseQuantity(perLocalVolumeUsed))
return pod, podStats
}
@@ -505,7 +505,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
Quantity: quantityMustParse("2Gi"),
},
},
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 2Gi, available: 1536Mi. Container above-requests was using 700Mi, request is 100Mi, has larger consumption of ephemeral-storage. ",
podToMakes: []podToMake{
{name: "below-requests", requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "900Mi"},
{name: "above-requests", requests: newResourceList("", "", "100Mi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "700Mi"},
@@ -516,7 +516,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "10Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalImageFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -537,7 +537,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "1Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi.Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalContainerFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -557,7 +557,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
nodeFsStats: "10Gi",
imageFsStats: "100Gi",
containerFsStats: "10Gi",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. ",
evictionMessage: "The node was low on resource: ephemeral-storage. Threshold quantity: 50Gi, available: 10Gi. Container above-requests was using 80Gi, request is 50Gi, has larger consumption of ephemeral-storage. ",
thresholdToMonitor: evictionapi.Threshold{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
@@ -588,7 +588,7 @@ func TestDiskPressureNodeFs_VerifyPodStatus(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -835,8 +835,8 @@ func TestMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// the best-effort pod should not admit, burstable should
@@ -1106,8 +1106,8 @@ func TestPIDPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod but should have had a grace period override.")
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -1336,7 +1336,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -1379,7 +1379,7 @@ func TestDiskPressureNodeFs(t *testing.T) {
}

// create a best effort pod to test admission
podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi")
podToAdmit, _ := podMaker("pod-to-admit", defaultPriority, newResourceList("", "", ""), newResourceList("", "", ""), "0Gi", "0Gi", "0Gi", nil)

// synchronize
_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
@@ -1494,8 +1494,8 @@ func TestDiskPressureNodeFs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -1644,8 +1644,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure, but not below the min-reclaim amount
@@ -1668,8 +1668,8 @@ func TestMinReclaim(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce memory pressure and ensure the min-reclaim amount
@@ -1858,7 +1858,7 @@ func TestNodeReclaimFuncs(t *testing.T) {
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed)
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, nil)
pods = append(pods, pod)
podStats[pod] = podStat
}
@@ -2060,8 +2060,8 @@ func TestNodeReclaimFuncs(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// reduce disk pressure
@@ -2458,8 +2458,8 @@ func TestInodePressureFsInodes(t *testing.T) {
t.Fatalf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod = *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Fatalf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}

// try to admit our pod (should fail)
@@ -2666,6 +2666,111 @@ func TestStaticCriticalPodsAreNotEvicted(t *testing.T) {
}
}

func TestStorageLimitEvictions(t *testing.T) {
volumeSizeLimit := resource.MustParse("1Gi")

testCases := map[string]struct {
pod podToMake
volumes []v1.Volume
}{
"eviction due to rootfs above limit": {
pod: podToMake{name: "rootfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), rootFsUsed: "2Gi"},
},
"eviction due to logsfs above limit": {
pod: podToMake{name: "logsfs-above-limits", priority: defaultPriority, requests: newResourceList("", "", "1Gi"), limits: newResourceList("", "", "1Gi"), logsFsUsed: "2Gi"},
},
"eviction due to local volume above limit": {
pod: podToMake{name: "localvolume-above-limits", priority: defaultPriority, requests: newResourceList("", "", ""), limits: newResourceList("", "", ""), perLocalVolumeUsed: "2Gi"},
volumes: []v1.Volume{{
Name: "emptyDirVolume",
VolumeSource: v1.VolumeSource{
EmptyDir: &v1.EmptyDirVolumeSource{
SizeLimit: &volumeSizeLimit,
},
},
}},
},
}
for name, tc := range testCases {
t.Run(name, func(t *testing.T) {
podMaker := makePodWithDiskStats
summaryStatsMaker := makeDiskStats
podsToMake := []podToMake{
tc.pod,
}
pods := []*v1.Pod{}
podStats := map[*v1.Pod]statsapi.PodStats{}
for _, podToMake := range podsToMake {
pod, podStat := podMaker(podToMake.name, podToMake.priority, podToMake.requests, podToMake.limits, podToMake.rootFsUsed, podToMake.logsFsUsed, podToMake.perLocalVolumeUsed, tc.volumes)
pods = append(pods, pod)
podStats[pod] = podStat
}

podToEvict := pods[0]
activePodsFunc := func() []*v1.Pod {
return pods
}

fakeClock := testingclock.NewFakeClock(time.Now())
podKiller := &mockPodKiller{}
diskInfoProvider := &mockDiskInfoProvider{dedicatedImageFs: ptr.To(false)}
diskGC := &mockDiskGC{err: nil}
nodeRef := &v1.ObjectReference{
Kind: "Node", Name: "test", UID: types.UID("test"), Namespace: "",
}

config := Config{
MaxPodGracePeriodSeconds: 5,
PressureTransitionPeriod: time.Minute * 5,
Thresholds: []evictionapi.Threshold{
{
Signal: evictionapi.SignalNodeFsAvailable,
Operator: evictionapi.OpLessThan,
Value: evictionapi.ThresholdValue{
Quantity: quantityMustParse("1Gi"),
},
},
},
}

diskStat := diskStats{
rootFsAvailableBytes: "200Mi",
imageFsAvailableBytes: "200Mi",
podStats: podStats,
}
summaryProvider := &fakeSummaryProvider{result: summaryStatsMaker(diskStat)}
manager := &managerImpl{
clock: fakeClock,
killPodFunc: podKiller.killPodNow,
imageGC: diskGC,
containerGC: diskGC,
config: config,
recorder: &record.FakeRecorder{},
summaryProvider: summaryProvider,
nodeRef: nodeRef,
nodeConditionsLastObservedAt: nodeConditionsObservedAt{},
thresholdsFirstObservedAt: thresholdsObservedAt{},
localStorageCapacityIsolation: true,
}

_, err := manager.synchronize(diskInfoProvider, activePodsFunc)
if err != nil {
t.Fatalf("Manager expects no error but got %v", err)
}

if podKiller.pod == nil {
t.Fatalf("Manager should have selected a pod for eviction")
}
if podKiller.pod != podToEvict {
t.Errorf("Manager should have killed pod: %v, but instead killed: %v", podToEvict.Name, podKiller.pod.Name)
}
if *podKiller.gracePeriodOverride != 1 {
t.Errorf("Manager should have evicted with gracePeriodOverride of 1, but used: %v", *podKiller.gracePeriodOverride)
}
})
}
}

// TestAllocatableMemoryPressure
func TestAllocatableMemoryPressure(t *testing.T) {
podMaker := makePodWithMemoryStats
@@ -2767,8 +2872,8 @@ func TestAllocatableMemoryPressure(t *testing.T) {
t.Errorf("Manager chose to kill pod: %v, but should have chosen %v", podKiller.pod.Name, podToEvict.Name)
}
observedGracePeriod := *podKiller.gracePeriodOverride
if observedGracePeriod != int64(0) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 0, observedGracePeriod)
if observedGracePeriod != int64(1) {
t.Errorf("Manager chose to kill pod with incorrect grace period. Expected: %d, actual: %d", 1, observedGracePeriod)
}
// reset state
podKiller.pod = nil
3 changes: 2 additions & 1 deletion pkg/kubelet/eviction/helpers_test.go
@@ -3053,8 +3053,9 @@ func newPodDiskStats(pod *v1.Pod, rootFsUsed, logsUsed, perLocalVolumeUsed resou

rootFsUsedBytes := uint64(rootFsUsed.Value())
logsUsedBytes := uint64(logsUsed.Value())
for range pod.Spec.Containers {
for _, container := range pod.Spec.Containers {
result.Containers = append(result.Containers, statsapi.ContainerStats{
Name: container.Name,
Rootfs: &statsapi.FsStats{
UsedBytes: &rootFsUsedBytes,
},
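The helpers_test.go change records each container's name in the fake per-container stats; without it, the expanded eviction messages asserted earlier (e.g. "Container above-requests was using 700Mi, request is 100Mi, …") would have no container name to report. A tiny illustrative sketch of that per-container suffix — the wording is copied from the test fixtures above, but the type and function names here are invented, not the kubelet's actual constants:

```go
package main

import "fmt"

// containerDiskStat is a stand-in for statsapi.ContainerStats with only the
// fields this sketch needs.
type containerDiskStat struct {
	Name    string
	Used    string
	Request string
}

// offendingContainerSuffix reproduces the per-container wording the test fixtures
// expect for the container exceeding its request; the function name is illustrative.
func offendingContainerSuffix(c containerDiskStat) string {
	return fmt.Sprintf("Container %s was using %s, request is %s, has larger consumption of ephemeral-storage. ",
		c.Name, c.Used, c.Request)
}

func main() {
	fmt.Print(offendingContainerSuffix(containerDiskStat{Name: "above-requests", Used: "700Mi", Request: "100Mi"}))
}
```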
5 changes: 4 additions & 1 deletion pkg/kubelet/pod_workers.go
@@ -979,23 +979,26 @@ func calculateEffectiveGracePeriod(status *podSyncStatus, pod *v1.Pod, options *
// enforce the restriction that a grace period can only decrease and track whatever our value is,
// then ensure a calculated value is passed down to lower levels
gracePeriod := status.gracePeriod
overridden := false
// this value is bedrock truth - the apiserver owns telling us this value calculated by apiserver
if override := pod.DeletionGracePeriodSeconds; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
overridden = true
}
}
// we allow other parts of the kubelet (namely eviction) to request this pod be terminated faster
if options != nil {
if override := options.PodTerminationGracePeriodSecondsOverride; override != nil {
if gracePeriod == 0 || *override < gracePeriod {
gracePeriod = *override
overridden = true
}
}
}
// make a best effort to default this value to the pod's desired intent, in the event
// the kubelet provided no requested value (graceful termination?)
if gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
if !overridden && gracePeriod == 0 && pod.Spec.TerminationGracePeriodSeconds != nil {
gracePeriod = *pod.Spec.TerminationGracePeriodSeconds
}
// no matter what, we always supply a grace period of 1
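The pod_workers.go hunk is the heart of the fix: previously, an explicit grace-period override of 0 left gracePeriod at 0, and the defaulting step then replaced it with the pod's own terminationGracePeriodSeconds, so an "immediate" eviction could wait out the pod's full grace period. Tracking overridden prevents that fallback. Below is a self-contained sketch of the fixed selection logic as reconstructed from the hunk; the signature is simplified, and the function's trailing clamp to a 1-second minimum (the "no matter what" comment) is cut off in the diff and omitted here:

```go
package main

import "fmt"

// effectiveGracePeriod sketches the fixed logic from the hunk above with kubelet
// types reduced to plain values: apiOverride stands in for
// pod.DeletionGracePeriodSeconds, evictionOverride for
// options.PodTerminationGracePeriodSecondsOverride, and specGracePeriod for
// pod.Spec.TerminationGracePeriodSeconds.
func effectiveGracePeriod(statusGracePeriod int64, apiOverride, evictionOverride, specGracePeriod *int64) int64 {
	gracePeriod := statusGracePeriod
	overridden := false
	if apiOverride != nil && (gracePeriod == 0 || *apiOverride < gracePeriod) {
		gracePeriod = *apiOverride
		overridden = true
	}
	if evictionOverride != nil && (gracePeriod == 0 || *evictionOverride < gracePeriod) {
		gracePeriod = *evictionOverride
		overridden = true
	}
	// Fall back to the pod's spec only when nothing explicitly overrode the value;
	// before the fix, an explicit override of 0 fell through to the spec here.
	if !overridden && gracePeriod == 0 && specGracePeriod != nil {
		gracePeriod = *specGracePeriod
	}
	return gracePeriod
}

func main() {
	spec := int64(30)
	immediate := int64(0)
	// Before the fix this returned 30 (the spec value); with the overridden flag it
	// keeps the explicit override — and evictions now pass 1 instead of 0 anyway.
	fmt.Println(effectiveGracePeriod(0, nil, &immediate, &spec)) // 0
}
```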