diff --git a/documentation/configuration_file.md b/documentation/configuration_file.md
index a7bd93d7..5cc47750 100644
--- a/documentation/configuration_file.md
+++ b/documentation/configuration_file.md
@@ -190,9 +190,11 @@ interval = 60
[pgprobackup]
enabled = false
-interval = 300
+interval = 900
backup_dirs = /backup_dir1,/backup_dir2
-pg_probackup_path = /usr/bin/pg_probackup-11
+pg_probackup_path = /usr/bin/pg_probackup-13
+max_time_run_backup2alert_in_sec = 21600
+max_time_lack_backup2alert_in_sec = 100800
```
**[preparedtransaction]**
@@ -219,6 +221,10 @@ The *interval* parameter allows you to change the metrics collection interval.
By default this plugin is disabled. To enable it set the enabled parameter to True.
-This plugin collects two metrics: *pg_probackup.dir.size[#backup_directory]* (the size of the target directory) and *pg_probackup.dir.error[#backup_directory]* (backup errors) for each specified *backup_directory*.
+This plugin collects several metrics:
+- *pg_probackup.dir.size[#backup_directory]* (the size of the target directory)
+- *pg_probackup.dir.error[#backup_directory]* (backup errors)
+- other metrics for each specified *backup_directory*.
+See [metrics.md](metrics.md) for the full list of metrics.
If any generated backup has bad status, like ERROR, CORRUPT, ORPHAN, а trigger is fired.
diff --git a/documentation/metrics.md b/documentation/metrics.md
index e3049162..949e1121 100644
--- a/documentation/metrics.md
+++ b/documentation/metrics.md
@@ -1,20 +1,21 @@
# Mamonsu: metrics
**Metrics:**
-- [Mamonsu health metrics](#mamonsu-health-metrics)
- - [Items](#items)
- - [Triggers](#triggers)
-- [System metrics](#system-metrics)
- - [*nix](#nix)
+- [Mamonsu: metrics](#mamonsu-metrics)
+ - [Mamonsu Health metrics](#mamonsu-health-metrics)
+ - [Items](#items)
+ - [Triggers](#triggers)
+ - [System metrics](#system-metrics)
+ - [*nix](#nix)
- [Items](#items-1)
- [Discovery Rules](#discovery-rules)
- [Graphs](#graphs)
- [Triggers](#triggers-1)
- - [Windows](#windows)
+ - [Windows](#windows)
- [Items](#items-2)
- [Discovery Rules](#discovery-rules-1)
-- [PostgreSQL metrics](#postgresql-metrics)
- - [Archiving](#archiving)
+ - [PostgreSQL metrics](#postgresql-metrics)
+ - [Archiving](#archiving)
- [Items](#items-3)
- [Graphs](#graphs-1)
- [Triggers](#triggers-2)
@@ -23,63 +24,63 @@
- [Background Writer](#background-writer)
- [Items](#items-5)
- [Graphs](#graphs-2)
- - [Blocks](#blocks)
+ - [Blocks](#blocks)
- [Items](#items-6)
- [Graphs](#graphs-3)
- - [Checkpoints](#checkpoints)
+ - [Checkpoints](#checkpoints)
- [Items](#items-7)
- [Graphs](#graphs-4)
- [Triggers](#triggers-3)
- - [Connections](#connections)
+ - [Connections](#connections)
- [Items](#items-8)
- [Graphs](#graphs-5)
- [Triggers](#triggers-4)
- - [Databases](#databases)
+ - [Databases](#databases)
- [Discovery Rules](#discovery-rules-2)
- - [Events](#events)
+ - [Events](#events)
- [Items](#items-9)
- [Graphs](#graphs-6)
- - [Health](#health)
+ - [Health](#health)
- [Items](#items-10)
- [Triggers](#triggers-5)
- - [Memory Leak](#memory-leak)
+ - [Memory Leak](#memory-leak)
- [Items](#items-11)
- [Triggers](#triggers-6)
- - [pg_buffercache](#pg_buffercache)
+ - [pg_buffercache](#pg_buffercache)
- [Items](#items-12)
- [Graphs](#graphs-7)
- - [pg_locks](#pg_locks)
+ - [pg_locks](#pg_locks)
- [Items](#items-13)
- [Graphs](#graphs-8)
- - [pg_stat_statements](#pg_stat_statements)
+ - [pg_stat_statements](#pg_stat_statements)
- [Items](#items-14)
- [Graphs](#graphs-9)
- - [Prepared Transactions](#prepared-transactions)
+ - [Prepared Transactions](#prepared-transactions)
- [Items](#items-15)
- [Graphs](#graphs-10)
- [Triggers](#triggers-7)
- - [Relations](#relations)
+ - [Relations](#relations)
- [Discovery Rules](#discovery-rules-3)
- - [Replication](#replication)
+ - [Replication](#replication)
- [Items](#items-16)
- [Discovery Rules](#discovery-rules-4)
- [Triggers](#triggers-8)
- - [Temp Files](#temp-files)
+ - [Temp Files](#temp-files)
- [Items](#items-17)
- [Graphs](#graphs-11)
- - [Transactions](#transactions)
+ - [Transactions](#transactions)
- [Items](#items-18)
- [Triggers](#triggers-9)
- - [Tuples](#tuples)
+ - [Tuples](#tuples)
- [Items](#items-19)
- [Graphs](#graphs-12)
- - [WAL](#wal)
+ - [WAL](#wal)
- [Items](#items-20)
-- [Postgres Pro metrics](#postgres-pro-metrics)
- - [Compressed File System](#compressed-file-system)
+ - [Postgres Pro metrics](#postgres-pro-metrics)
+ - [Compressed File System](#compressed-file-system)
- [Items](#items-21)
- [Discovery Rules](#discovery-rules-5)
- - [pg_wait_sampling](#pg_wait_sampling)
+ - [pg_wait_sampling](#pg_wait_sampling)
- [Items](#items-22)
- [Graphs](#graphs-13)
@@ -1203,57 +1204,68 @@ Default config:
4. **pg_probackup Discovery**
Items:
-
-
- Name |
- Pg_probackup dir {#BACKUPDIR}: error |
- Pg_probackup dir {#BACKUPDIR}: size |
-
-
- Key |
- pg_probackup.dir.error[{#BACKUPDIR}] |
- pg_probackup.dir.size[{#BACKUPDIR}] |
-
-
- Type |
- Text |
- Numeric (float) |
-
-
- Units |
- |
- Bytes |
-
-
- Delta |
- As Is |
- As Is |
-
-
+
+| Name | Key | Storage | Description |
+| ---------------------------------------------------------- | ------------------------------------------------ | ------- | ---------------------------------------------------------- |
+| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Total catalog size: /backups + /wal |
+| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Subdirectory Size /backups |
+| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Subdirectory Size /wal |
+| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Duration in seconds of creating a full backup |
+| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Duration in seconds of creating an incremental backup |
+| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation started |
+| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation completed |
+| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Current backup mode |
+| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Current backup status |
+| Pg_probackup dir {#BACKUPDIR}: error | pg_probackup.dir.error[{#BACKUPDIR}] | | A sign of an erroneous state or "ok" if everything is fine |
Graphs:
-
-
- Name |
- Pg_probackup: backup dir: {#BACKUPDIR} size |
-
-
- Metrics |
- Pg_probackup dir {#BACKUPDIR}: size |
-
-
+
+1. Pg_probackup: backup dir: {#BACKUPDIR} size
+
+Shows 3 metrics with information about the size of directories with archival copies:
+
+| Key | Side graphs | Description |
+| ------------------------------------------- | ------------ | ------------------------------------ |
+| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Total Directory Size /backups + /wal |
+| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | Subdirectory size /backups |
+| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | Subdirectory size /wal |
+
+2. Pg_probackup: backup dir: {#BACKUPDIR} duration
+
+Shows 2 metrics with a duration of creating archive copies:
+
+| Key | Side graphs | Description |
+| -------------------------------------------- | ------------ | ----------------------------------------------------- |
+| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Duration in seconds of creating a full backup |
+| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Duration in seconds of creating an incremental backup |
Triggers:
-
-
- Name |
- Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) |
-
-
- Expression |
- Triggers if pg_probackup status is not OK. |
-
-
+
+The following alerts have been created that allow you to monitor the status of archive directories:
+
+* The alert triggers if a backup has been running for longer than specified in the configuration parameter `max_time_run_backup2alert_in_sec`. The time is specified in seconds; the default value is 21600 (6 hours). The current state of the backup creation process is monitored.
+
+| Category | Details |
+| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Importance: | Warning |
+| Name: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) |
+| Expression: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec |
+
+* The alert triggers if no new backup has been created for longer than specified in the configuration parameter `max_time_lack_backup2alert_in_sec`. The time is specified in seconds; the default value is 100800 (28 hours). It checks that the next backup (of any type) is created no later than the time specified in the parameter.
+
+| Category | Details |
+| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Importance: | Warning |
+| Name: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} |
+| Expression: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec |
+
+* The alert triggers if a backup has one of the erroneous statuses: 'ERROR', 'CORRUPT', 'ORPHAN'. It checks the status of every backup, not only the latest one, and stays active as long as any backup with an erroneous status exists.
+
+| Category | Details |
+| ----------- | ----------------------------------------------------------------------------------- |
+| Importance: | Average |
+| Name: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) |
+| Expression: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 |
### Graphs
diff --git a/documentation/metrics.ru.md b/documentation/metrics.ru.md
new file mode 100644
index 00000000..1cab5da6
--- /dev/null
+++ b/documentation/metrics.ru.md
@@ -0,0 +1,82 @@
+# Описания плагинов
+
+## pg_probackup.py
+Предназначен для контроля за состоянием каталогов бэкапов создаваемых утилитой [pg_probackup](https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup).
+Плагин адаптирован для контроля нескольких инстансов в одном каталоге. Имя инстанса указывается в ключе метрики как подкаталог.
+
+### Настройки в секции [pgprobackup]
+
+| Наименование | Значение по умолчанию | Описание |
+| --------------------------------- | ------------------------- | ------------------------------------------------------------------ |
+| enabled | False | По умолчанию плагин отключен. Укажите True для включения |
+| interval | 900 | Как часто опрашивать состояние каталогов. Указано в секундах |
+| backup_dirs | /backup_dir1,/backup_dir2 | Список каталогов бэкапов утилиты pg_probackup |
+| pg_probackup_path | /usr/bin/pg_probackup-13 | Полный путь к утилите создания бэкапов pg_probackup |
+| max_time_run_backup2alert_in_sec | 21600 | Время срабатывания алерта "Backup runs too long on..." в секундах. |
+| max_time_lack_backup2alert_in_sec | 100800 | Время срабатывания алерта "Long time no backups on..." в секундах. |
+
+
+### Текущие метрики в Discovery правиле:
+
+| Наименование | Ключ | Хранить | Описание |
+| ---------------------------------------------------------- | ------------------------------------------------ | ------- | -------------------------------------------------------- |
+| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Общий размер каталога: /backups + /wal |
+| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Размер подкаталога /backups |
+| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Размер подкаталога /wal |
+| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Длительность в секундах создания полного бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Длительность в секундах создания инкрементального бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) старта создания бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) завершения создания бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Текущий режим бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Текущий статус бэкапа |
+| Pg_probackup dir {#BACKUPDIR}: error | pg_probackup.dir.error[{#BACKUPDIR}] | | Признак ошибочного состояния или "ok" если всё хорошо |
+
+
+### Текущие алерты в Discovery правиле:
+Созданы следующие алерты, позволяющие контролировать состояние архивных каталогов:
+
+* Алерт срабатывает если создание бэкапа выполняется дольше, чем указано в настроечном параметре `max_time_run_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 21600 (6 часов). Контролируется текущее состояние в котором находится процесс создания бэкапной копии.
+
+| Категория | Детали |
+| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- |
+| Важность: | Warning |
+| Наименование: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) |
+| Выражение: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec |
+
+* Алерт срабатывает если не выполняется создание нового бэкапа дольше, чем указано в настроечном параметре `max_time_lack_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 100800 (28 часов). Контролируется, что очередной бэкап (тип бэкапа любой) будет создан не позже, чем указано в параметре.
+
+| Категория | Детали |
+| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ |
+| Важность: | Warning |
+| Наименование: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} |
+| Выражение: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec |
+
+* Алерт срабатывает если при создании бэкапа произошла ошибка - 'ERROR', 'CORRUPT', 'ORPHAN'. Контролирует состояние любой архивной копии, не только последней. Активен всё время пока есть любая архивная копия с ошибочным состоянием.
+
+| Категория | Детали |
+| ------------- | ----------------------------------------------------------------------------------- |
+| Важность: | Average |
+| Наименование: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) |
+| Выражение: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 |
+
+
+### Текущие графики в Discovery правиле:
+
+1. Pg_probackup: backup dir: {#BACKUPDIR} size
+
+Показывает 3 метрики с информацией о размерах каталогов с архивными копиями:
+
+| Метрика | Сторона графика | Описание |
+| ------------------------------------------- | --------------- | -------------------------------------- |
+| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Общий размер каталогов /backups + /wal |
+| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | размер подкаталога /backups |
+| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | размер подкаталога /wal |
+
+2. Pg_probackup: backup dir: {#BACKUPDIR} duration
+
+Показывает 2 метрики с длительностью создания архивных копий:
+
+| Метрика | Сторона графика | Описание |
+| -------------------------------------------- | --------------- | -------------------------------------------------------- |
+| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Длительность в секундах создания полного бэкапа |
+| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Длительность в секундах создания инкрементального бэкапа |
diff --git a/mamonsu/plugins/system/linux/pg_probackup.py b/mamonsu/plugins/system/linux/pg_probackup.py
index 2472aa52..1f937436 100644
--- a/mamonsu/plugins/system/linux/pg_probackup.py
+++ b/mamonsu/plugins/system/linux/pg_probackup.py
@@ -3,17 +3,28 @@
import json
import os
import subprocess
-
+from datetime import datetime
class PgProbackup(Plugin):
os_walk_error = None
block_size = 4096
- Interval = 5 * 60
+ Interval = 15 * 60
key_main = 'pg_probackup.discovery{0}'
key_dir_size = 'pg_probackup.dir.size{0}'
key_dir_error = 'pg_probackup.dir.error{0}'
+ key_dir_duration_full = 'pg_probackup.dir.duration_full{0}'
+ key_dir_duration_inc = 'pg_probackup.dir.duration_inc{0}'
+ key_dir_endtime_backup = 'pg_probackup.dir.end_time_backup{0}'
+ key_dir_starttime_backup = 'pg_probackup.dir.start_time_backup{0}'
+ key_dir_status_backup = 'pg_probackup.dir.status_backup{0}'
+ key_dir_mode_backup = 'pg_probackup.dir.mode_backup{0}'
AgentPluginType = 'pg'
Type = "mamonsu"
+
+ DEFAULT_CONFIG = {
+ 'max_time_run_backup2alert_in_sec': str(21600), # The maximum time of running time of backup to Alert in seconds (6 hours)
+ 'max_time_lack_backup2alert_in_sec': str(100800), # The maximum time of lack of backup to Alert (28 hours)
+ }
def set_os_walk_error(self, e):
self.os_walk_error = e
@@ -56,50 +67,102 @@ def run(self, zbx):
"""Disable plugin and exit, because the parameter 'backup_dirs' in section [pgprobackup] is not set.
Set this parameter if needed and restart.""")
+ fmt_data = '%Y-%m-%d %H:%M:%S+03'
backup_dirs = config_backup_dirs.split(',')
dirs = []
- for _dir in backup_dirs:
- dirs.append({'{#BACKUPDIR}': _dir})
-
- dir_size = self.dir_size(_dir)
- if self.os_walk_error:
- self.log.error(
- "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format(
- backup_catalog=_dir, error=str(self.os_walk_error)))
- else:
- zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size)
+ for _dir_top in backup_dirs:
# Search for backups with bad status is done by running
# "pg_probackup show -B backup_dir" command
- command = [config_pg_probackup_path, 'show', '-B', _dir, '--format=json']
+ command = [config_pg_probackup_path, 'show', '-B', _dir_top, '--format=json']
process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
stdout, stderr = process.communicate()
return_code = process.returncode
if return_code != 0:
self.log.error(
"The command: {command} return code {return_code}. Error: {error}".format(command=command,
- return_code=return_code,
- error=stderr))
+ return_code=return_code,
+ error=stderr))
continue
try:
result = json.loads(stdout.decode('utf-8'))
except Exception as e:
self.log.error('Error in convert data: {stdout} \n {e}'.format(stdout=stdout, e=e))
continue
+
no_error= True
+
for instance in result:
- for backup in instance.get('backups', []):
+ # We consider the sizes of each instance
+ instance_name = instance['instance']
+ _dir = _dir_top + '/' + instance_name
+ dirs.append({'{#BACKUPDIR}': _dir})
+
+ # sud-directory backups
+ dir_size_backups = self.dir_size(_dir_top + '/backups/' + instance_name)
+ if self.os_walk_error:
+ self.log.error(
+ "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format(
+ backup_catalog=(_dir_top + '/backups/' + instance_name), error=str(self.os_walk_error)))
+ else:
+ # We consider the size of the predefined directories - backups
+ zbx.send(self.key_dir_size.format('[' + _dir + '/backups]'), dir_size_backups)
+
+ # sud-directory wal
+ dir_size_wal = self.dir_size(_dir_top + '/wal/' + instance_name)
+ if self.os_walk_error:
+ self.log.error(
+ "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format(
+ backup_catalog=(_dir_top + '/wal/' + instance_name), error=str(self.os_walk_error)))
+ else:
+ # We consider the size of the predefined directories - wal
+ zbx.send(self.key_dir_size.format('[' + _dir + '/wal]'), dir_size_wal)
+
+ # We consider the size of the predefined directories - backups and wal
+ zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size_backups+dir_size_wal)
+
+ full_send = 0
+ for idx, backup in enumerate(instance.get('backups', [])):
status = backup['status']
+ mode = backup['backup-mode']
+ if idx == 0:
+ # Status of the last backup
+ zbx.send(self.key_dir_status_backup.format('[' + _dir + ']'), status)
+ # Backup Creation Mode Full, Page, Delta and Ptrack of the last backup
+ zbx.send(self.key_dir_mode_backup.format('[' + _dir + ']'), mode)
if status in ['ERROR', 'CORRUPT', 'ORPHAN']:
error = 'Backup with id: {backup_id} in instance: {instance_name} in pg_probackup dir: ' \
- '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'],
- instance_name=instance['instance'],
- status=status, backup_catalog=_dir)
+ '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'],
+ instance_name=instance_name,
+ status=status, backup_catalog=_dir)
self.log.info(error)
no_error = False
zbx.send(self.key_dir_error.format('[' + _dir + ']'), error)
- if no_error:
- zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok')
+ if idx == 0:
+ # the start time of the last backup at unixtime
+ start = datetime.strptime(backup['start-time'], fmt_data)
+ zbx.send(self.key_dir_starttime_backup.format('[' + _dir + ']'), start.timestamp())
+ # check end-time and calculate duration
+ if 'end-time' in backup:
+ end = datetime.strptime(backup['end-time'], fmt_data)
+ delta = (end - start).total_seconds()
+ # the end time of the last backup at unixtime
+ zbx.send(self.key_dir_endtime_backup.format('[' + _dir + ']'), end.timestamp())
+ # duration full or incremental of the last backup
+ if backup['backup-mode'] == "FULL":
+ zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta)
+ full_send = 1
+ else:
+ zbx.send(self.key_dir_duration_inc.format('[' + _dir + ']'), delta)
+ if full_send == 0 and 'end-time' in backup and backup['backup-mode'] == "FULL":
+ start = datetime.strptime(backup['start-time'], fmt_data)
+ end = datetime.strptime(backup['end-time'], fmt_data)
+ delta = (end - start).total_seconds()
+ zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta)
+ full_send = 1
+
+ if no_error:
+ zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok')
zbx.send(self.key_main.format('[]'), zbx.json({'data': dirs}))
del dirs
@@ -128,24 +191,132 @@ def discovery_rules(self, template, dashboard=False):
'name': 'Pg_probackup dir {#BACKUPDIR}: size',
'units': Plugin.UNITS.bytes,
'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
- 'delay': self.plugin_config('interval')},
+ 'history': '31',
+ 'delay': self.plugin_config('interval'),
+ 'description': "Size of the entire catalog with backups"},
+ {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}/backups: size',
+ 'units': Plugin.UNITS.bytes,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'history': '31',
+ 'delay': self.plugin_config('interval'),
+ 'description': "The size of the entire subdirectory /backups"},
+ {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}/wal: size',
+ 'units': Plugin.UNITS.bytes,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'history': '31',
+ 'delay': self.plugin_config('interval'),
+ 'description': "The size of the entire subdirectory /wal"},
{'key': self.right_type(self.key_dir_error, var_discovery="{#BACKUPDIR},"),
'name': 'Pg_probackup dir {#BACKUPDIR}: error',
'value_type': Plugin.VALUE_TYPE.text,
- 'delay': self.plugin_config('interval')},
+ 'delay': self.plugin_config('interval'),
+ 'description': "Sign of the erroneous completion of the backup: 'ERROR', 'CORRUPT', 'ORPHAN'"},
+ {'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: duration full backup',
+ 'units': Plugin.UNITS.s,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'history': '31',
+ 'delay': self.plugin_config('interval'),
+ 'description': "The duration of the last full backup"},
+ {'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: duration incremental backup',
+ 'units': Plugin.UNITS.s,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'history': '31',
+ 'delay': self.plugin_config('interval'),
+ 'description': "The duration of the last incremental backup"},
+ {'key': self.right_type(self.key_dir_endtime_backup, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: end time backup',
+ 'units': Plugin.UNITS.unixtime,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'delay': self.plugin_config('interval'),
+ 'description': "The end time of the last any backup"},
+ {'key': self.right_type(self.key_dir_starttime_backup, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: start time backup',
+ 'units': Plugin.UNITS.unixtime,
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned,
+ 'delay': self.plugin_config('interval'),
+ 'description': "The start time of the last any backup"},
+ {'key': self.right_type(self.key_dir_status_backup, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: status',
+ 'value_type': Plugin.VALUE_TYPE.text,
+ 'delay': self.plugin_config('interval'),
+ 'description': "Sign of the status completion of the last backup:\n\n"
+ "OK — the backup is complete and valid.\n"
+ "DONE — the backup is complete, but was not validated.\n"
+ "RUNNING — the backup is in progress.\n"
+ "MERGING — the backup is being merged.\n"
+ "MERGED — the backup data files were successfully merged, but its metadata is in the process of being updated. Only full backups can have this status.\n"
+ "DELETING — the backup files are being deleted.\n"
+ "CORRUPT — some of the backup files are corrupt.\n"
+ "ERROR — the backup was aborted because of an unexpected error.\n"
+ "ORPHAN — the backup is invalid because one of its parent backups is corrupt or missing.\n\n"
+ "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup"
+ },
+ {'key': self.right_type(self.key_dir_mode_backup, var_discovery="{#BACKUPDIR},"),
+ 'name': 'Pg_probackup dir {#BACKUPDIR}: mode',
+ 'value_type': Plugin.VALUE_TYPE.text,
+ 'delay': self.plugin_config('interval'),
+ 'description': "Backup Creation Mode:\n\n"
+ "FULL — creates a full backup that contains all the data files of the cluster to be restored.\n"
+ "DELTA — reads all data files in the data directory and creates an incremental backup for pages that have changed since the previous backup.\n"
+ "PAGE — creates an incremental backup based on the WAL files that have been generated since the previous full or incremental backup was taken. Only changed blocks are read from data files.\n"
+ "PTRACK — creates an incremental backup tracking page changes on the fly.\n\n"
+ "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup"
+ },
]
graphs = [
+ {
+ 'name': 'Pg_probackup: backup dir: {#BACKUPDIR} duration',
+ 'type': 0,
+ 'items': [
+ {'color': '00897B',
+ 'drawtype': 2,
+ 'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},")},
+ {'color': '66BB6A',
+ 'drawtype': 2,
+ 'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"),
+ 'yaxisside': 1}
+ ]
+ },
{
'name': 'Pg_probackup: backup dir: {#BACKUPDIR} size',
- 'type': 1,
+ 'type': 0,
'items': [
- {'color': '00CC00',
- 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")}]
+ {'color': 'C8E6C9',
+ 'drawtype': 1,
+ 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")},
+ {'color': '00897B',
+ 'drawtype': 2,
+ 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,")},
+ {'color': '66BB6A',
+ 'drawtype': 2,
+ 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"),
+ 'yaxisside': 1}
+ ]
},
]
- triggers = [{
- 'name': 'Error in pg_probackup dir '
- '{#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})',
- 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1'}
+ triggers = [
+ {'name': 'Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})',
+ 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1',
+ 'priority': 3,
+ 'description': 'Backup status: CORRUPT / ERROR / ORPHAN'},
+ {'name': 'Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR}',
+ 'expression': '({#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()})>'
+ + self.plugin_config('max_time_lack_backup2alert_in_sec'),
+ 'priority': 2,
+ 'description': 'From the moment of completion of the backup passed more than '
+ + str(int(int(self.plugin_config('max_time_lack_backup2alert_in_sec'))/3600)) + ' hours ('
+ + self.plugin_config('max_time_lack_backup2alert_in_sec') + ' seconds)'},
+ {'name': 'Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING)',
+ 'expression': '{#TEMPLATE:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING"'
+ ' and ({#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()})>'
+ + self.plugin_config('max_time_run_backup2alert_in_sec'),
+ 'priority': 2,
+ 'description': 'From the moment of start of the backup passed more than '
+ + str(int(int(self.plugin_config('max_time_run_backup2alert_in_sec'))/3600)) + ' hours ('
+ + self.plugin_config('max_time_run_backup2alert_in_sec') + ' seconds)'},
]
return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers)
diff --git a/packaging/conf/example.conf b/packaging/conf/example.conf
index b313b6c3..14fe99fc 100644
--- a/packaging/conf/example.conf
+++ b/packaging/conf/example.conf
@@ -183,9 +183,11 @@ interval = 60
# Trigger fires if some backup has bad status e.g. (ERROR,CORRUPT,ORPHAN).
[pgprobackup]
enabled = False
-interval = 300
+interval = 900
backup_dirs = /backup_dir1,/backup_dir2
-pg_probackup_path = /usr/bin/pg_probackup-11
+pg_probackup_path = /usr/bin/pg_probackup-13
+max_time_run_backup2alert_in_sec = 21600
+max_time_lack_backup2alert_in_sec = 100800
# Get size of relations defined in this section
# Relations - comma separated list of objects - tables and endexes (database_name.schema.relation) used to calculate relations size.