---
# Prometheus alerting rules for PostgreSQL.
# The metrics referenced by the rules in this group (pg_up,
# pg_stat_activity_*, pg_settings_*, pg_stat_database_*) look like
# postgres_exporter metrics - TODO confirm against the deployed exporter.
groups:
- name: PostgreSQL
  rules:
| 4 | + - alert: PostgreSQLMaxConnectionsReached |
| 5 | + annotations: |
| 6 | + description: '{{ $labels.instance }} is exceeding the currently configured maximum |
| 7 | + Postgres connection limit (current value: {{ $value }}s). Services may be |
| 8 | + degraded - please take immediate action (you probably need to increase max_connections |
| 9 | + in the Docker image and re-deploy.' |
| 10 | + summary: '{{ $labels.instance }} has maxed out Postgres connections.' |
| 11 | + expr: | |
| 12 | + sum by (instance) (pg_stat_activity_count{}) |
| 13 | + >= |
| 14 | + sum by (instance) (pg_settings_max_connections{}) |
| 15 | + - |
| 16 | + sum by (instance) (pg_settings_superuser_reserved_connections{}) |
| 17 | + for: 1m |
| 18 | + labels: |
| 19 | + severity: warning |
| 20 | + - alert: PostgreSQLHighConnections |
| 21 | + annotations: |
| 22 | + description: '{{ $labels.instance }} is exceeding 80% of the currently configured |
| 23 | + maximum Postgres connection limit (current value: {{ $value }}s). Please check |
| 24 | + utilization graphs and confirm if this is normal service growth, abuse or |
| 25 | + an otherwise temporary condition or if new resources need to be provisioned |
| 26 | + (or the limits increased, which is mostly likely).' |
| 27 | + summary: '{{ $labels.instance }} is over 80% of max Postgres connections.' |
| 28 | + expr: | |
| 29 | + sum by (instance) (pg_stat_activity_count{}) |
| 30 | + > |
| 31 | + ( |
| 32 | + sum by (instance) (pg_settings_max_connections{}) |
| 33 | + - |
| 34 | + sum by (instance) (pg_settings_superuser_reserved_connections{}) |
| 35 | + ) * 0.8 |
| 36 | + for: 10m |
| 37 | + labels: |
| 38 | + severity: warning |
| 39 | + - alert: PostgreSQLDown |
| 40 | + annotations: |
| 41 | + description: '{{ $labels.instance }} is rejecting query requests from the exporter, |
| 42 | + and thus probably not allowing DNS requests to work either. User services |
| 43 | + should not be effected provided at least 1 node is still alive.' |
| 44 | + summary: 'PostgreSQL is not processing queries: {{ $labels.instance }}' |
| 45 | + expr: pg_up{} != 1 |
| 46 | + for: 1m |
| 47 | + labels: |
| 48 | + severity: warning |
| 49 | + - alert: PostgreSQLSlowQueries |
| 50 | + annotations: |
| 51 | + description: 'PostgreSQL high number of slow queries {{ $labels.cluster }} for |
| 52 | + database {{ $labels.datname }} with a value of {{ $value }} ' |
| 53 | + summary: 'PostgreSQL high number of slow on {{ $labels.cluster }} for database |
| 54 | + {{ $labels.datname }} ' |
| 55 | + expr: | |
| 56 | + avg by (datname) ( |
| 57 | + rate ( |
| 58 | + pg_stat_activity_max_tx_duration{datname!~"template.*",}[2m] |
| 59 | + ) |
| 60 | + ) > 2 * 60 |
| 61 | + for: 2m |
| 62 | + labels: |
| 63 | + severity: warning |
| 64 | + - alert: PostgreSQLQPS |
| 65 | + annotations: |
| 66 | + description: PostgreSQL high number of queries per second on {{ $labels.cluster |
| 67 | + }} for database {{ $labels.datname }} with a value of {{ $value }} |
| 68 | + summary: PostgreSQL high number of queries per second {{ $labels.cluster }} |
| 69 | + for database {{ $labels.datname }} |
| 70 | + expr: | |
| 71 | + avg by (datname) ( |
| 72 | + irate( |
| 73 | + pg_stat_database_xact_commit{datname!~"template.*",}[5m] |
| 74 | + ) |
| 75 | + + |
| 76 | + irate( |
| 77 | + pg_stat_database_xact_rollback{datname!~"template.*",}[5m] |
| 78 | + ) |
| 79 | + ) > 10000 |
| 80 | + for: 5m |
| 81 | + labels: |
| 82 | + severity: warning |
| 83 | + - alert: PostgreSQLCacheHitRatio |
| 84 | + annotations: |
| 85 | + description: PostgreSQL low on cache hit rate on {{ $labels.cluster }} for database |
| 86 | + {{ $labels.datname }} with a value of {{ $value }} |
| 87 | + summary: PostgreSQL low cache hit rate on {{ $labels.cluster }} for database |
| 88 | + {{ $labels.datname }} |
| 89 | + expr: | |
| 90 | + avg by (datname) ( |
| 91 | + rate(pg_stat_database_blks_hit{datname!~"template.*",}[5m]) |
| 92 | + / |
| 93 | + ( |
| 94 | + rate( |
| 95 | + pg_stat_database_blks_hit{datname!~"template.*",}[5m] |
| 96 | + ) |
| 97 | + + |
| 98 | + rate( |
| 99 | + pg_stat_database_blks_read{datname!~"template.*",}[5m] |
| 100 | + ) |
| 101 | + ) |
| 102 | + ) < 0.98 |
| 103 | + for: 5m |
| 104 | + labels: |
| 105 | + severity: warning |
# 0 commit comments (page footer captured with the diff; not part of the rules)