From a3fbf21977deb89b7d843eb8371170c011ea6835 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Milan=20Pl=C5=BE=C3=ADk?= <4592597+mplzik@users.noreply.github.com> Date: Wed, 27 Nov 2024 19:37:26 +0100 Subject: [PATCH] Make the daemonset rollout stuck alert configurable. (#989) * Make the daemonset rollout stuck alert configurable. For bigger Kubernetes clusters with bigger node churn (for instance, cloud clusters with spot nodes), the daemonset rollouts often get stuck for longer than just 15 minutes, so the alert might easily misfire even in cases where the delay is legitimate. This PR introduces a configurable `for` value to allow for customization. As a default, the original value `15m` is left, so the only real difference would be a slight change in the alert message formatting. Signed-off-by: Milan Plzik * Fix the tests. Signed-off-by: Milan Plzik --------- Signed-off-by: Milan Plzik --- alerts/apps_alerts.libsonnet | 5 +++-- tests.yaml | 8 ++++---- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/alerts/apps_alerts.libsonnet b/alerts/apps_alerts.libsonnet index 3616d61f2..1edac9b71 100644 --- a/alerts/apps_alerts.libsonnet +++ b/alerts/apps_alerts.libsonnet @@ -4,6 +4,7 @@ local utils = import '../lib/utils.libsonnet'; _config+:: { kubeStateMetricsSelector: error 'must provide selector for kube-state-metrics', kubeJobTimeoutDuration: error 'must provide value for kubeJobTimeoutDuration', + kubeDaemonSetRolloutStuckFor: '15m', namespaceSelector: null, prefixedNamespaceSelector: if self.namespaceSelector != null then self.namespaceSelector + ',' else '', }, @@ -204,10 +205,10 @@ local utils = import '../lib/utils.libsonnet'; severity: 'warning', }, annotations: { - description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least 15 minutes.', + description: 'DaemonSet {{ $labels.namespace }}/{{ $labels.daemonset }} has not finished or progressed for at least %(kubeDaemonSetRolloutStuckFor)s.' 
% $._config, summary: 'DaemonSet rollout is stuck.', }, - 'for': '15m', + 'for': $._config.kubeDaemonSetRolloutStuckFor, }, { expr: ||| diff --git a/tests.yaml b/tests.yaml index 82dd4bb60..44d422ff5 100644 --- a/tests.yaml +++ b/tests.yaml @@ -822,7 +822,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -878,7 +878,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -909,7 +909,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' + description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 34m alertname: KubeDaemonSetRolloutStuck @@ -940,7 +940,7 @@ tests: severity: warning exp_annotations: summary: "DaemonSet rollout is stuck." - description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15 minutes.' 
+ description: 'DaemonSet monitoring/node-exporter has not finished or progressed for at least 15m.' runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubedaemonsetrolloutstuck - eval_time: 36m alertname: KubeDaemonSetRolloutStuck