Skip to content

Commit

Permalink
Enable postgres-exporter by default from the timescaledb-single Helm chart.
Browse files Browse the repository at this point in the history
- Add in postgres-exporter mixin alerts and Grafana dashboard
- Fix and merge how we build and generate mixin dashboards and alerts
  • Loading branch information
nhudson committed Aug 3, 2022
1 parent 0c7c832 commit 4e50499
Show file tree
Hide file tree
Showing 9 changed files with 1,764 additions and 34 deletions.
13 changes: 12 additions & 1 deletion .github/workflows/dashboards.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,9 @@ on:
paths:
- chart/dashboards/**

env:
golang-version: 1.18.4

jobs:
dashboard-sync:
runs-on: ubuntu-latest
Expand All @@ -17,6 +20,14 @@ jobs:
with:
fetch-depth: 0

- name: Set up golang
uses: actions/setup-go@v3
with:
go-version: ${{ env.golang-version }}

- name: Run Dashboard synchronizer
run: |
./scripts/sync-dashboards.sh && git diff --exit-code
go install github.com/monitoring-mixins/mixtool/cmd/mixtool@latest && \
go install github.com/google/go-jsonnet/cmd/jsonnetfmt@latest && \
./scripts/sync-mixins.sh && \
git diff --exit-code
4 changes: 4 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,7 @@ kubescape: manifests.yaml ## Runs a security analysis on generated manifests -

# Prints a usage summary. The awk program splits each makefile line on
# ":.*##" and, for every line that looks like "<target>: ... ## <text>",
# prints the target name (padded to 13 columns, in cyan) followed by <text>.
# "$$" is make's escape for a literal "$", so awk receives $1 and $2.
help: ## Displays help.
@awk 'BEGIN {FS = ":.*##"; printf "\nUsage:\n make \033[36m<target>\033[0m\n\nTargets:\n"} /^[a-z0-9A-Z_-]+:.*?##/ { printf " \033[36m%-13s\033[0m %s\n", $$1, $$2 }' $(MAKEFILE_LIST)

# Runs ./scripts/sync-mixins.sh. Declared phony so make always executes the
# recipe instead of looking for a file named "sync-mixins". The text after
# "##" is the description listed by the help target.
.PHONY: sync-mixins
sync-mixins: ## Syncs mixins from Promscale and Postgres-Exporter
./scripts/sync-mixins.sh
105 changes: 105 additions & 0 deletions chart/alerts/postgres-exporter-alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# Prometheus alerting rules for PostgreSQL, evaluated against the pg_up,
# pg_settings_*, pg_stat_activity_* and pg_stat_database_* series.
# NOTE(review): this file looks like generated mixin output; if it is, mirror
# any change made here in the mixin source, otherwise the next sync will
# revert it - confirm.
groups:
  - name: PostgreSQL
    rules:
      # Active connections have reached max_connections minus the slots
      # reserved for superusers, for at least one minute.
      - alert: PostgreSQLMaxConnectionsReached
        annotations:
          description: '{{ $labels.instance }} is exceeding the currently configured
            maximum Postgres connection limit (current value: {{ $value }}). Services
            may be degraded - please take immediate action (you probably need to
            increase max_connections in the Docker image and re-deploy).'
          summary: '{{ $labels.instance }} has maxed out Postgres connections.'
        expr: |
          sum by (instance) (pg_stat_activity_count{})
          >=
          sum by (instance) (pg_settings_max_connections{})
          -
          sum by (instance) (pg_settings_superuser_reserved_connections{})
        for: 1m
        labels:
          severity: warning

      # Active connections have stayed above 80% of the non-reserved
      # connection slots for ten minutes.
      - alert: PostgreSQLHighConnections
        annotations:
          description: '{{ $labels.instance }} is exceeding 80% of the currently
            configured maximum Postgres connection limit (current value:
            {{ $value }}). Please check utilization graphs and confirm if this is
            normal service growth, abuse or an otherwise temporary condition or if
            new resources need to be provisioned (or the limits increased, which is
            most likely).'
          summary: '{{ $labels.instance }} is over 80% of max Postgres connections.'
        expr: |
          sum by (instance) (pg_stat_activity_count{})
          >
          (
            sum by (instance) (pg_settings_max_connections{})
            -
            sum by (instance) (pg_settings_superuser_reserved_connections{})
          ) * 0.8
        for: 10m
        labels:
          severity: warning

      # pg_up has been anything other than 1 for one minute.
      - alert: PostgreSQLDown
        annotations:
          description: '{{ $labels.instance }} is rejecting query requests from the
            exporter, and thus probably not allowing DNS requests to work either.
            User services should not be affected provided at least 1 node is still
            alive.'
          summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}'
        expr: pg_up{} != 1
        for: 1m
        labels:
          severity: warning

      # Average longest-transaction duration per database above 2 * 60.
      # The duration is compared directly: wrapping it in rate() measured how
      # fast the duration grows, which stays far below this threshold, so the
      # alert could not fire.
      # NOTE(review): assumes pg_stat_activity_max_tx_duration is a gauge in
      # seconds - confirm against the exporter in use.
      # "cluster" is kept in the grouping so the annotations below can render
      # {{ $labels.cluster }}.
      - alert: PostgreSQLSlowQueries
        annotations:
          description: 'PostgreSQL high number of slow queries {{ $labels.cluster }}
            for database {{ $labels.datname }} with a value of {{ $value }} '
          summary: 'PostgreSQL high number of slow queries on {{ $labels.cluster }}
            for database {{ $labels.datname }} '
        expr: |
          avg by (cluster, datname) (
            pg_stat_activity_max_tx_duration{datname!~"template.*",}
          ) > 2 * 60
        for: 2m
        labels:
          severity: warning

      # Commits plus rollbacks per second above 10000 for five minutes.
      # "cluster" is kept in the grouping so the annotations can render it.
      - alert: PostgreSQLQPS
        annotations:
          description: 'PostgreSQL high number of queries per second on
            {{ $labels.cluster }} for database {{ $labels.datname }} with a value
            of {{ $value }}'
          summary: 'PostgreSQL high number of queries per second
            {{ $labels.cluster }} for database {{ $labels.datname }}'
        expr: |
          avg by (cluster, datname) (
            irate(
              pg_stat_database_xact_commit{datname!~"template.*",}[5m]
            )
            +
            irate(
              pg_stat_database_xact_rollback{datname!~"template.*",}[5m]
            )
          ) > 10000
        for: 5m
        labels:
          severity: warning

      # Block cache hit ratio, hits / (hits + reads), below 0.98 for five
      # minutes. "cluster" is kept in the grouping so the annotations can
      # render it.
      - alert: PostgreSQLCacheHitRatio
        annotations:
          description: 'PostgreSQL low on cache hit rate on {{ $labels.cluster }}
            for database {{ $labels.datname }} with a value of {{ $value }}'
          summary: 'PostgreSQL low cache hit rate on {{ $labels.cluster }} for
            database {{ $labels.datname }}'
        expr: |
          avg by (cluster, datname) (
            rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m])
            /
            (
              rate(
                pg_stat_database_blks_hit{datname!~"template.*",}[5m]
              )
              +
              rate(
                pg_stat_database_blks_read{datname!~"template.*",}[5m]
              )
            )
          ) < 0.98
        for: 5m
        labels:
          severity: warning
13 changes: 13 additions & 0 deletions chart/ci/e2e-values.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,18 @@ timescaledb-single:
## TimescaleDB resource requests
resources: null

# Enable Prometheus exporter for PostgreSQL server metrics.
# https://github.com/prometheus-community/postgres_exporter
prometheus:
enabled: true
image:
repository: quay.io/prometheuscommunity/postgres-exporter
tag: v0.11.0

# Specifies whether ServiceMonitor for Prometheus operator should be created
serviceMonitor:
enabled: true

# Values for configuring the deployment of the Promscale
# The charts README is at:
# https://github.com/timescale/promscale/tree/master/helm-chart
Expand Down Expand Up @@ -235,6 +247,7 @@ kube-prometheus-stack:
- dashboards/apm-service-dependencies-upstream.json
- dashboards/apm-service-overview.json
- dashboards/promscale.json
- dashboards/postgres-overview.json
adminUser: admin
# To configure password externally refer to https://github.com/grafana/helm-charts/blob/6578497320d3c4672bab3a3c7fd38dffba1c9aba/charts/grafana/values.yaml#L340-L345
adminPassword: ""
Expand Down
Loading

0 comments on commit 4e50499

Please sign in to comment.