Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,4 @@ image_list:
- { name: "quay.io/kiwigrid/k8s-sidecar", tag: "{{ grafana_sidecar_image_tag }}" }
- { name: "registry.k8s.io/kube-state-metrics/kube-state-metrics", tag: "{{ kube_prometheus_stack_metrics_image_tag }}" }
- { name: "registry.k8s.io/ingress-nginx/kube-webhook-certgen", tag: "{{ kube_prometheus_stack_patch_image_tag }}" }
- { name: "quay.io/prometheus/blackbox-exporter", tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}" }
6 changes: 6 additions & 0 deletions ansible/roles/kube_prometheus_stack/defaults/main/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,12 @@ kube_prometheus_stack_wait_timeout: 5m
kube_prometheus_stack_metrics_image_tag: v2.12.0
kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6

# Version of the prometheus-blackbox-exporter helm chart to install
kube_prometheus_stack_blackbox_exporter_release_version: 9.0.1
# blackbox-exporter container image tag (also pre-pulled via image_list)
kube_prometheus_stack_blackbox_exporter_image_tag: v0.25.0
# Helm release name for the blackbox-exporter deployment
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter

# Probe modules passed to the chart's config.modules; empty here, populated
# from inventory group_vars (see environments/common .../blackbox_exporter.yml)
kube_prometheus_stack_blackbox_modules: {}

control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"

grafana_auth_anonymous: false
Expand Down
20 changes: 20 additions & 0 deletions ansible/roles/kube_prometheus_stack/tasks/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -176,6 +176,26 @@
ansible.builtin.import_role:
name: grafana-dashboards

# Deploy prometheus-blackbox-exporter alongside kube-prometheus-stack so that
# Prometheus can run blackbox probes against service endpoints.
- name: Install blackbox exporter helm chart
  no_log: true # release_values may expose testuser password
  kubernetes.core.helm:
    chart_ref: prometheus-blackbox-exporter
    chart_repo_url: https://prometheus-community.github.io/helm-charts
    chart_version: "{{ kube_prometheus_stack_blackbox_exporter_release_version }}"
    release_name: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}"
    release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
    release_values:
      # pin the exporter to nodes labelled as servers
      nodeSelector:
        clusterrole: "server"
      # probe modules are defined via inventory group_vars
      config:
        modules: "{{ kube_prometheus_stack_blackbox_modules }}"
      configReloader:
        image:
          tag: "{{ kube_prometheus_stack_app_version }}" # keeps consistent with pre-pulled image for kube-prometheus-stack
      image:
        tag: "{{ kube_prometheus_stack_blackbox_exporter_image_tag }}"
    wait: true # canonical YAML boolean ('yes' is a YAML 1.1 truthy alias)

- name: Install kube-prometheus-stack on target Kubernetes cluster
kubernetes.core.helm:
chart_ref: "{{ kube_prometheus_stack_chart_name }}"
Expand Down
19 changes: 19 additions & 0 deletions docs/monitoring-and-logging.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,10 @@ Metrics are scraped from exporters. Exporters are services which expose HTTP end

Tool which parses slurm accounting data and produces a log file that is suitable for ingest by filebeat.

### [blackbox-exporter](https://github.com/prometheus/blackbox_exporter)

Tool which allows blackbox probing of endpoints over HTTP, HTTPS, DNS, TCP, ICMP and gRPC.

## Definition of terms

In this section we define any terms that may not be widely understood.
Expand Down Expand Up @@ -290,6 +294,21 @@ slurm-stats is configured `slurm-stats` role in the [slurm_openstack_tools colle
The `slurm_stats` group controls the placement of the `slurm_stats` service.
This should be configured to be a group with a single host. That host must be co-located on the same host as the `filebeat` service that scrapes its output.

## blackbox-exporter

### Defaults and adding jobs

Blackbox exporter is configured using role variables in the [kube_prometheus_stack role](../ansible/roles/kube_prometheus_stack/defaults/main). Blackbox uses modules to
probe service endpoints. Modules can be configured by overriding the maps in [environments/common/inventory/group_vars/all/blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml), see [upstream docs](https://github.com/prometheus/blackbox_exporter/blob/master/CONFIGURATION.md)
and [underlying Helm chart values](https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L162) for module configuration options. Probes are defined through Prometheus scrape jobs, which can be added in [environments/common/inventory/group_vars/all/prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml). See upstream docs for configuring blackbox-exporter scrape jobs.

By default an HTTPS probe for Open OnDemand is added if there are hosts in the `openondemand` group. The module and scrape job for this are defined in
[environments/common/inventory/group_vars/all/openondemand.yml](../environments/common/inventory/group_vars/all/openondemand.yml) (these are merged into the config in [blackbox_exporter.yml](../environments/common/inventory/group_vars/all/blackbox_exporter.yml) and [prometheus.yml](../environments/common/inventory/group_vars/all/prometheus.yml) respectively).

### Placement

Installed as part of the kube_prometheus_stack role, whose placement is controlled by the `prometheus` group. As above, there is currently no load balancing support, so the exporter should only be placed on a single node — by default the Slurm control node.

### Access

Probes can be viewed through the `Prometheus Blackbox Exporter` Grafana dashboard.
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
{
"cluster_image": {
"RL8": "openhpc-RL8-241029-0905-f23c2fca",
"RL9": "openhpc-RL9-241029-0949-f23c2fca",
"RL9-cuda": "openhpc-cuda-RL9-241029-0905-f23c2fca"
"RL8": "openhpc-RL8-241106-1149-6e780c0d",
"RL9": "openhpc-RL9-241106-1149-6e780c0d",
"RL9-cuda": "openhpc-cuda-RL9-241106-1149-6e780c0d"
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Note: the underlying helm chart is configured with a default module 'http_2xx'; defining
# a module with this name here will merge the module's values with the existing module, see
# https://github.com/prometheus-community/helm-charts/blob/main/charts/prometheus-blackbox-exporter/values.yaml#L163

# Base set of blackbox probe modules; empty by default, extended per-service below
kube_prometheus_stack_blackbox_modules_defaults: {}

# Add the Open OnDemand probe module only when the 'openondemand' group has hosts.
# groups.get() (rather than groups['openondemand']) avoids a templating error when
# the group is not defined in the inventory at all, matching the groups.get()
# guard used for the dashboards in grafana.yml.
kube_prometheus_stack_blackbox_modules: "{{ kube_prometheus_stack_blackbox_modules_defaults if ( groups.get('openondemand', []) | count == 0 ) else ( kube_prometheus_stack_blackbox_modules_defaults | combine(openondemand_blackbox_modules) ) }}"

# See prometheus_scrape_configs in prometheus.yml to add additional scrape jobs to probe services
6 changes: 6 additions & 0 deletions environments/common/inventory/group_vars/all/grafana.yml
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,12 @@ grafana_dashboards_default:
- placeholder: DS_PROMETHEUS
replacement: prometheus
revision_id: 3
# blackbox probes
- dashboard_id: 14928
replacements:
- placeholder: DS_PROMETHEUS
replacement: prometheus
revision_id: 6
grafana_dashboards: "{{ grafana_dashboards_default + (openondemand_dashboard if groups.get('openondemand') else []) }}"

# Configmap names of kube prometheus stack's default dashboards to exclude
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,5 +2,6 @@ kube_prometheus_stack_chart_version: 59.1.0
kube_prometheus_stack_release_namespace: monitoring-system
kube_prometheus_stack_release_name: kube-prometheus-stack
kube_prometheus_stack_wait_timeout: 5m
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter

# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services
28 changes: 28 additions & 0 deletions environments/common/inventory/group_vars/all/openondemand.yml
Original file line number Diff line number Diff line change
Expand Up @@ -182,6 +182,34 @@ openondemand_scrape_configs:
labels:
environment: "{{ appliances_environment_name }}"
service: "openondemand"
# Probe Open OnDemand via the blackbox exporter: Prometheus scrapes the
# exporter's /probe endpoint, passing the real target as a URL parameter.
- job_name: "ood-blackbox-probe"
  metrics_path: /probe
  params:
    module: [ood_http_2xx] # module defined in openondemand_blackbox_modules below
  static_configs:
    - targets:
        - "{{ openondemand_address }}"
  relabel_configs:
    # expose the probed address both as __param_target (sent to the exporter)
    # and as a 'target' label on the resulting series
    - source_labels: [__address__]
      target_label: __param_target
    - source_labels: [__param_target]
      target_label: target
    # then point the actual scrape at the blackbox exporter service in-cluster
    - target_label: __address__
      replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"

# Blackbox probe module used by the 'ood-blackbox-probe' scrape job above to
# check the Open OnDemand web frontend over HTTPS with basic auth.
openondemand_blackbox_modules:
  ood_http_2xx:
    prober: http
    timeout: 5s
    http:
      valid_http_versions: ["HTTP/1.1", "HTTP/2.0"]
      follow_redirects: true
      preferred_ip_protocol: "ip4"
      tls_config:
        # presumably the OOD endpoint serves a self-signed cert — confirm; note
        # this disables TLS verification for the probe
        insecure_skip_verify: true
      basic_auth:
        username: "testuser"
        # vault-encrypted; tasks templating this value should set no_log
        password: "{{ vault_testuser_password }}"

openondemand_dashboard:
- dashboard_id: 13465
Expand Down
48 changes: 48 additions & 0 deletions environments/common/inventory/group_vars/all/prometheus.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,54 @@ prometheus_extra_rules:
expr: "slurm_nodes_down > 0\n"
labels:
severity: critical
# Fire immediately when any blackbox probe reports failure
- alert: BlackboxProbeFailed
  expr: probe_success == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '{% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
- alert: BlackboxSlowProbe
  # threshold 1.25s: around 1.14s is expected due to indirection in the cluster,
  # so only alert when the 1m average is meaningfully above that
  expr: avg_over_time(probe_duration_seconds[1m]) > 1.25
  for: 1m
  labels:
    severity: warning
  annotations:
    summary: '{% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
    # message corrected to match the 1.25s threshold above (previously said 1s)
    description: "{% raw %}Blackbox probe '{{ $labels.target }}' took more than 1.25s to complete - {{ $value }}{% endraw %}"
- alert: BlackboxProbeHttpFailure
  # fire on informational (<=199) or client/server error (>=400) status codes;
  # 'or' lowercased to match PromQL's documented set-operator spelling
  expr: probe_http_status_code <= 199 or probe_http_status_code >= 400
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '{% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error status - {{ $value }}{% endraw %}"
# Certificate expiry ladder: warning at <30 days, critical at <7 days, critical
# on expiry. last_over_time(...[10m]) keeps the alert stable across scrape gaps.
- alert: BlackboxSslCertificateWillExpireSoon
  # seconds-to-expiry between 7 and 30 days
  expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)
  for: 0m
  labels:
    severity: warning
  annotations:
    summary: '{% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
- alert: BlackboxSslCertificateWillExpireVerySoon
  # seconds-to-expiry between 0 and 7 days
  expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '{% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
- alert: BlackboxSslCertificateExpired
  # expiry timestamp already in the past
  expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0
  for: 0m
  labels:
    severity: critical
  annotations:
    summary: '{% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
- record: node_cpu_system_seconds:record
expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
- record: node_cpu_user_seconds:record
Expand Down
Loading