From e1830526b923da8049486ca18de8cd2d1ce6568f Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 17 Mar 2025 13:17:41 +0000 Subject: [PATCH 1/3] fix: use `rabbitmq` length for `RabbitMQNodeDown` The `RabbitMQNodeDown` alert assumed that every deployment has exactly three RabbitMQ nodes. However, this is not always the case, as we also support deployments with a single node or with more than three. Previously, this caused false alerts in deployments with a single RabbitMQ node, while also concealing alerts in deployments with more than three nodes. --- etc/kayobe/kolla/config/prometheus/rabbitmq.rules | 2 +- ...length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) create mode 100644 releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml diff --git a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules index d72230421..1bbb69e54 100644 --- a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules +++ b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules @@ -6,7 +6,7 @@ groups: - name: rabbitmq.rules rules: - alert: RabbitMQNodeDown - expr: sum(rabbitmq_build_info{instance!=""}) < 3 + expr: sum(rabbitmq_build_info{instance!=""}) < {% endraw %}{{ groups['rabbitmq'] | length }}{% raw %} for: 30m labels: severity: critical diff --git a/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml b/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml new file mode 100644 index 000000000..0af7e746d --- /dev/null +++ b/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml @@ -0,0 +1,6 @@ +--- +features: + - | + Use the length of the ``rabbitmq`` group to determine if any RabbitMQ + nodes are down. This is benefical for deployments that do not use a + standard three node setup. 
From e7282e6f0f271ccd9a7143faddc2c4759112db6f Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Mon, 24 Mar 2025 12:59:42 +0000 Subject: [PATCH 2/3] Update etc/kayobe/kolla/config/prometheus/rabbitmq.rules Co-authored-by: Matt Crees --- etc/kayobe/kolla/config/prometheus/rabbitmq.rules | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules index 1bbb69e54..49407d85b 100644 --- a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules +++ b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules @@ -6,7 +6,7 @@ groups: - name: rabbitmq.rules rules: - alert: RabbitMQNodeDown - expr: sum(rabbitmq_build_info{instance!=""}) < {% endraw %}{{ groups['rabbitmq'] | length }}{% raw %} + expr: sum(rabbitmq_build_info{instance!=""}) < {{ groups['rabbitmq'] | length }} for: 30m labels: severity: critical From 747181f4a7c4e47d3b0fd77d820a3dfc63a1e230 Mon Sep 17 00:00:00 2001 From: Jack Hodgkiss Date: Fri, 9 May 2025 14:26:27 +0100 Subject: [PATCH 3/3] feat: add `alertmanager_number_of_rabbitmq_nodes` --- etc/kayobe/kolla/config/prometheus/rabbitmq.rules | 2 +- etc/kayobe/stackhpc-monitoring.yml | 3 +++ ...ngth-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml | 8 +++++--- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules index 49407d85b..82022d72a 100644 --- a/etc/kayobe/kolla/config/prometheus/rabbitmq.rules +++ b/etc/kayobe/kolla/config/prometheus/rabbitmq.rules @@ -6,7 +6,7 @@ groups: - name: rabbitmq.rules rules: - alert: RabbitMQNodeDown - expr: sum(rabbitmq_build_info{instance!=""}) < {{ groups['rabbitmq'] | length }} + expr: sum(rabbitmq_build_info{instance!=""}) < {% endraw %}{{ alertmanager_number_of_rabbitmq_nodes }}{% raw %} for: 30m labels: severity: critical diff --git a/etc/kayobe/stackhpc-monitoring.yml b/etc/kayobe/stackhpc-monitoring.yml index 
5eee4b19c..2b34c5566 100644 --- a/etc/kayobe/stackhpc-monitoring.yml +++ b/etc/kayobe/stackhpc-monitoring.yml @@ -26,6 +26,9 @@ alertmanager_packet_drop_threshold: 1 # packets/s averaged over 5 minutes. alertmanager_packet_errors_threshold: 1 +# Expected number of RabbitMQ nodes, used by the RabbitMQNodeDown alert. +alertmanager_number_of_rabbitmq_nodes: "{{ groups['controllers'] | length }}" + ############################################################################### # Exporter configuration diff --git a/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml b/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml index 0af7e746d..e7b7b2f9a 100644 --- a/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml +++ b/releasenotes/notes/use-length-for-rabbitmq-node-down-rule-c9e9c6b09f57954d.yaml @@ -1,6 +1,8 @@ --- features: - | - Use the length of the ``rabbitmq`` group to determine if any RabbitMQ - nodes are down. This is benefical for deployments that do not use a - standard three node setup. + Allow for easy customisation of the number of expected ``RabbitMQ`` + nodes when evaluating the ``RabbitMQNodeDown`` alert. It is set by + ``alertmanager_number_of_rabbitmq_nodes``, which defaults to the + number of hosts in the ``controllers`` group. This is beneficial for + deployments that do not use a standard three node setup.