From 58acafd4b7f656f8f8f05bdaec8c333cb698529b Mon Sep 17 00:00:00 2001 From: Milan Plzik Date: Mon, 18 Nov 2024 13:59:06 +0100 Subject: [PATCH 1/2] Make the daemonset rollout stuck alert configurable. For bigger Kubernetes clusters with bigger node churn (for instance, cloud clusters with spot nodes), the daemonset rollouts often get stuck for longer than just 15 minutes. As a result, the alert might easily misfire even in cases where the delay is legitimate. This PR introduces a configurable `for` value to allow for customization. By default, the original value `15m` is kept, so the only real difference is a slight change in the alert message formatting. Signed-off-by: Milan Plzik --- alerts/apps_alerts.libsonnet | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 3616d61f2..1edac9b71 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -4,6 +4,7 @@ local utils = import '../lib/utils.libsonnet'; _config+:: { kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration', + kubeDaemonSetRolloutStuckFor: '15m', namespaceSelector: null, prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '', }, @@ -204,10 +205,10 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.', + description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' 
% $._config, summary: 'DaemonSet rollout is stuck.', }, - 'for': '15m', + 'for': $._config.kubeDaemonSetRolloutStuckFor, }, { expr: ||| From 64b5274035611aa2d283e197ef1f792cc423c86a Mon Sep 17 00:00:00 2001 From: Milan Plzik Date: Wed, 27 Nov 2024 09:56:48 +0100 Subject: [PATCH 2/2] Fix the tests. Signed-off-by: Milan Plzik --- tests.yaml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests.yaml b/tests.yaml index 82dd4bb60..44d422ff5 100644 --- a/tests.yaml +++ b/tests.yaml @@ -822,7 +822,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -878,7 +878,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -909,7 +909,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' 
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -940,7 +940,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 36m alertname: KubeDaemonSetRolloutStuck