Skip to content

Commit 4e50499

Browse files
committed
Enable postgres-exporter by default from the timescaledb-single Helm
chart. - Add in postgres-exporter mixin alerts and Grafana dashboard - Fix and merge how we build and generate mixin dashboards and alerts
1 parent 0c7c832 commit 4e50499

File tree

9 files changed

+1764
-34
lines changed

9 files changed

+1764
-34
lines changed

.github/workflows/dashboards.yml

Lines changed: 12 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,9 @@ on:
88
paths:
99
- chart/dashboards/**
1010

11+
env:
12+
golang-version: 1.18.4
13+
1114
jobs:
1215
dashboard-sync:
1316
runs-on: ubuntu-latest
@@ -17,6 +20,14 @@ jobs:
1720
with:
1821
fetch-depth: 0
1922

23+
- name: Set up golang
24+
uses: actions/setup-go@v3
25+
with:
26+
go-version: ${{ env.golang-version }}
27+
2028
- name: Run Dashboard synchronizer
2129
run: |
22-
./scripts/sync-dashboards.sh && git diff --exit-code
30+
go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest && \
31+
go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest && \
32+
./scripts/sync-mixins.sh && \
33+
git diff --exit-code

Makefile

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,3 +58,7 @@ kubescape: manifests.yaml ## Runs a security analysis on generated manifests -
5858

5959
help: ## Displays help.
6060
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n\nTargets:\n"} /^[a-z0-9A-Z_-]+:.*?##/ { printf " \033[36m%-13s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST)
61+
62+
.PHONY: sync-mixins
63+
sync-mixins: ## Syncs mixins from Promscale and Postgres-Exporter
64+
./scripts/sync-mixins.sh
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
groups:
2+
- name: PostgreSQL
3+
rules:
4+
# Fires when the per-instance connection count reaches max_connections minus
# the slots reserved for superusers. PromQL binds `-` tighter than `>=`, so
# the expression reads: count >= (max_connections - reserved).
- alert: PostgreSQLMaxConnectionsReached
  annotations:
    # $value is the left-hand side of the comparison, i.e. a connection
    # count, so it carries no unit suffix.
    description: >-
      {{ $labels.instance }} is exceeding the currently configured maximum
      Postgres connection limit (current value: {{ $value }}). Services may be
      degraded - please take immediate action (you probably need to increase
      max_connections in the Docker image and re-deploy).
    summary: '{{ $labels.instance }} has maxed out Postgres connections.'
  expr: |
    sum by (instance) (pg_stat_activity_count{})
    >=
    sum by (instance) (pg_settings_max_connections{})
    -
    sum by (instance) (pg_settings_superuser_reserved_connections{})
  for: 1m
  labels:
    severity: warning
20+
# Early warning: fires when the per-instance connection count stays above 80%
# of the connections available to non-superusers (max_connections minus the
# superuser-reserved slots) for 10 minutes.
- alert: PostgreSQLHighConnections
  annotations:
    # $value is the left-hand side of the comparison, i.e. a connection
    # count, so it carries no unit suffix.
    description: >-
      {{ $labels.instance }} is exceeding 80% of the currently configured
      maximum Postgres connection limit (current value: {{ $value }}). Please
      check utilization graphs and confirm if this is normal service growth,
      abuse or an otherwise temporary condition or if new resources need to be
      provisioned (or the limits increased, which is most likely).
    summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
  expr: |
    sum by (instance) (pg_stat_activity_count{})
    >
    (
      sum by (instance) (pg_settings_max_connections{})
      -
      sum by (instance) (pg_settings_superuser_reserved_connections{})
    ) * 0.8
  for: 10m
  labels:
    severity: warning
39+
# Fires when the exporter's pg_up metric has been anything other than 1 for
# one minute.
- alert: PostgreSQLDown
  annotations:
    # NOTE(review): the "DNS requests" wording looks carried over from an
    # unrelated alert; confirm the intended impact statement with the mixin
    # owners before rewording it.
    description: >-
      {{ $labels.instance }} is rejecting query requests from the exporter,
      and thus probably not allowing DNS requests to work either. User
      services should not be affected provided at least 1 node is still alive.
    summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
  expr: pg_up{} != 1
  for: 1m
  labels:
    severity: warning
49+
# Fires when, for a non-template database, the longest-running transaction
# duration averaged over the last 2 minutes exceeds 2 minutes.
- alert: PostgreSQLSlowQueries
  annotations:
    description: >-
      PostgreSQL high number of slow queries on {{ $labels.cluster }} for
      database {{ $labels.datname }} with a value of {{ $value }}
    summary: >-
      PostgreSQL high number of slow queries on {{ $labels.cluster }} for
      database {{ $labels.datname }}
  # pg_stat_activity_max_tx_duration is a gauge (a duration in seconds), so
  # it is smoothed with avg_over_time() rather than rate(): rate() is only
  # meaningful for counters and would make the 120 s threshold unreachable.
  # `cluster` is kept in the grouping so the annotations above can render it;
  # when the series carry no such label the grouping is unchanged.
  expr: |
    avg by (cluster, datname) (
      avg_over_time(
        pg_stat_activity_max_tx_duration{datname!~"template.*"}[2m]
      )
    ) > 2 * 60
  for: 2m
  labels:
    severity: warning
64+
# Fires when the combined commit + rollback rate of a non-template database
# stays above 10000 transactions per second for 5 minutes.
- alert: PostgreSQLQPS
  annotations:
    description: >-
      PostgreSQL high number of queries per second on {{ $labels.cluster }}
      for database {{ $labels.datname }} with a value of {{ $value }}
    summary: >-
      PostgreSQL high number of queries per second on {{ $labels.cluster }}
      for database {{ $labels.datname }}
  # rate() is used instead of irate(): irate() only looks at the last two
  # samples in the window and is intended for graphing, not alerting.
  # `cluster` is kept in the grouping so the annotations above can render it;
  # when the series carry no such label the grouping is unchanged.
  expr: |
    avg by (cluster, datname) (
      rate(
        pg_stat_database_xact_commit{datname!~"template.*"}[5m]
      )
      +
      rate(
        pg_stat_database_xact_rollback{datname!~"template.*"}[5m]
      )
    ) > 10000
  for: 5m
  labels:
    severity: warning
83+
# Fires when the buffer-cache hit ratio (blocks hit / (blocks hit + blocks
# read)) of a non-template database stays below 98% for 5 minutes.
- alert: PostgreSQLCacheHitRatio
  annotations:
    description: >-
      PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
      {{ $labels.datname }} with a value of {{ $value }}
    summary: >-
      PostgreSQL low cache hit rate on {{ $labels.cluster }} for database
      {{ $labels.datname }}
  # `cluster` is kept in the grouping so the annotations above can render it;
  # when the series carry no such label the grouping is unchanged.
  # A database with no block activity yields 0/0 = NaN, which never compares
  # below the threshold, so idle databases do not alert.
  expr: |
    avg by (cluster, datname) (
      rate(pg_stat_database_blks_hit{datname!~"template.*"}[5m])
      /
      (
        rate(
          pg_stat_database_blks_hit{datname!~"template.*"}[5m]
        )
        +
        rate(
          pg_stat_database_blks_read{datname!~"template.*"}[5m]
        )
      )
    ) < 0.98
  for: 5m
  labels:
    severity: warning

chart/ci/e2e-values.yaml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -47,6 +47,18 @@ timescaledb-single:
4747
## TimescaleDB resource requests
4848
resources: null
4949

50+
# Enable Prometheus exporter for PostgreSQL server metrics.
51+
# https://github.com/prometheus-community/postgres_exporter
52+
prometheus:
53+
enabled: true
54+
image:
55+
repository: quay.io/prometheuscommunity/postgres-exporter
56+
tag: v0.11.0
57+
58+
# Specifies whether ServiceMonitor for Prometheus operator should be created
59+
serviceMonitor:
60+
enabled: true
61+
5062
# Values for configuring the deployment of the Promscale
5163
# The charts README is at:
5264
# https://github.com/timescale/promscale/tree/master/helm-chart
@@ -235,6 +247,7 @@ kube-prometheus-stack:
235247
- dashboards/apm-service-dependencies-upstream.json
236248
- dashboards/apm-service-overview.json
237249
- dashboards/promscale.json
250+
- dashboards/postgres-overview.json
238251
adminUser: admin
239252
# To configure password externally refer to https://github.com/grafana/helm-charts/blob/6578497320d3c4672bab3a3c7fd38dffba1c9aba/charts/grafana/values.yaml#L340-L345
240253
adminPassword: ""

0 commit comments

Comments
 (0)