From 3830dfd31671f22e5531068d7ed59f0c76fdabc6 Mon Sep 17 00:00:00 2001 From: 7840vz <122374011+7840vz@users.noreply.github.com> Date: Thu, 7 Nov 2024 21:22:11 +0800 Subject: [PATCH] Fix KubeClientCertificateExpiration alerts (#941) 1) Change aggregation `by (le)` to `without(service,endpoint...)`, dropping only useless labels, but keeping external labels (like environment etc) intact. Otherwise they get dropped. 2) Change order of metrics in expression: `apiserver_client_certificate_expiration_seconds_bucket` metric comes first so actual expiration date is shown as result in Grafana->Explore queries, not `apiserver_client_certificate_expiration_seconds_count` value (which is quite useless). This make it easier to troubleshoot. 3) Finally, fix aggregation for `on(job)` to become `(job, cluster, instance)`. Otherwise, It would be enough to have just single instance with certificate expiration problem, and it would set all apiservers to 'firing' (false positive!). --- alerts/kube_apiserver.libsonnet | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/alerts/kube_apiserver.libsonnet b/alerts/kube_apiserver.libsonnet index 55448ac2b..7f4ced3dc 100644 --- a/alerts/kube_apiserver.libsonnet +++ b/alerts/kube_apiserver.libsonnet @@ -50,7 +50,9 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeClientCertificateExpiration', expr: ||| - apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s + histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationWarningSeconds)s + and + on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 ||| % $._config, 'for': '5m', labels: { @@ -64,7 +66,9 @@ local utils = import '../lib/utils.libsonnet'; { alert: 'KubeClientCertificateExpiration', expr: ||| - apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 and on(%(clusterLabel)s, job) histogram_quantile(0.01, sum by (%(clusterLabel)s, job, le) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s + histogram_quantile(0.01, sum without (%(namespaceLabel)s, service, endpoint) (rate(apiserver_client_certificate_expiration_seconds_bucket{%(kubeApiserverSelector)s}[5m]))) < %(certExpirationCriticalSeconds)s + and + on(job, %(clusterLabel)s, instance) apiserver_client_certificate_expiration_seconds_count{%(kubeApiserverSelector)s} > 0 ||| % $._config, 'for': '5m', labels: {