diff --git a/documentation/configuration_file.md b/documentation/configuration_file.md index a7bd93d7..5cc47750 100644 --- a/documentation/configuration_file.md +++ b/documentation/configuration_file.md @@ -190,9 +190,11 @@ interval = 60 [pgprobackup] enabled = false -interval = 300 +interval = 900 backup_dirs = /backup_dir1,/backup_dir2 -pg_probackup_path = /usr/bin/pg_probackup-11 +pg_probackup_path = /usr/bin/pg_probackup-13 +max_time_run_backup2alert_in_sec = 21600 +max_time_lack_backup2alert_in_sec = 100800 ``` **[preparedtransaction]** @@ -219,6 +221,10 @@ The *interval* parameter allows you to change the metrics collection interval. By default this plugin is disabled. To enable it set the enabled parameter to True. -This plugin collects two metrics: *pg_probackup.dir.size[#backup_directory]* (the size of the target directory) and *pg_probackup.dir.error[#backup_directory]* (backup errors) for each specified *backup_directory*. +This plugin collects several metrics: +- *pg_probackup.dir.size[#backup_directory]* (the size of the target directory) +- *pg_probackup.dir.error[#backup_directory]* (backup errors) +- other metrics for each specified *backup_directory*. +See file metrics.md If any generated backup has bad status, like ERROR, CORRUPT, ORPHAN, а trigger is fired. 
diff --git a/documentation/metrics.md b/documentation/metrics.md index e3049162..949e1121 100644 --- a/documentation/metrics.md +++ b/documentation/metrics.md @@ -1,20 +1,21 @@ # Mamonsu: metrics **Metrics:** -- [Mamonsu health metrics](#mamonsu-health-metrics) - - [Items](#items) - - [Triggers](#triggers) -- [System metrics](#system-metrics) - - [*nix](#nix) +- [Mamonsu: metrics](#mamonsu-metrics) + - [Mamonsu Health metrics](#mamonsu-health-metrics) + - [Items](#items) + - [Triggers](#triggers) + - [System metrics](#system-metrics) + - [*nix](#nix) - [Items](#items-1) - [Discovery Rules](#discovery-rules) - [Graphs](#graphs) - [Triggers](#triggers-1) - - [Windows](#windows) + - [Windows](#windows) - [Items](#items-2) - [Discovery Rules](#discovery-rules-1) -- [PostgreSQL metrics](#postgresql-metrics) - - [Archiving](#archiving) + - [PostgreSQL metrics](#postgresql-metrics) + - [Archiving](#archiving) - [Items](#items-3) - [Graphs](#graphs-1) - [Triggers](#triggers-2) @@ -23,63 +24,63 @@ - [Background Writer](#background-writer) - [Items](#items-5) - [Graphs](#graphs-2) - - [Blocks](#blocks) + - [Blocks](#blocks) - [Items](#items-6) - [Graphs](#graphs-3) - - [Checkpoints](#checkpoints) + - [Checkpoints](#checkpoints) - [Items](#items-7) - [Graphs](#graphs-4) - [Triggers](#triggers-3) - - [Connections](#connections) + - [Connections](#connections) - [Items](#items-8) - [Graphs](#graphs-5) - [Triggers](#triggers-4) - - [Databases](#databases) + - [Databases](#databases) - [Discovery Rules](#discovery-rules-2) - - [Events](#events) + - [Events](#events) - [Items](#items-9) - [Graphs](#graphs-6) - - [Health](#health) + - [Health](#health) - [Items](#items-10) - [Triggers](#triggers-5) - - [Memory Leak](#memory-leak) + - [Memory Leak](#memory-leak) - [Items](#items-11) - [Triggers](#triggers-6) - - [pg_buffercache](#pg_buffercache) + - [pg_buffercache](#pg_buffercache) - [Items](#items-12) - [Graphs](#graphs-7) - - [pg_locks](#pg_locks) + - [pg_locks](#pg_locks) - 
[Items](#items-13) - [Graphs](#graphs-8) - - [pg_stat_statements](#pg_stat_statements) + - [pg_stat_statements](#pg_stat_statements) - [Items](#items-14) - [Graphs](#graphs-9) - - [Prepared Transactions](#prepared-transactions) + - [Prepared Transactions](#prepared-transactions) - [Items](#items-15) - [Graphs](#graphs-10) - [Triggers](#triggers-7) - - [Relations](#relations) + - [Relations](#relations) - [Discovery Rules](#discovery-rules-3) - - [Replication](#replication) + - [Replication](#replication) - [Items](#items-16) - [Discovery Rules](#discovery-rules-4) - [Triggers](#triggers-8) - - [Temp Files](#temp-files) + - [Temp Files](#temp-files) - [Items](#items-17) - [Graphs](#graphs-11) - - [Transactions](#transactions) + - [Transactions](#transactions) - [Items](#items-18) - [Triggers](#triggers-9) - - [Tuples](#tuples) + - [Tuples](#tuples) - [Items](#items-19) - [Graphs](#graphs-12) - - [WAL](#wal) + - [WAL](#wal) - [Items](#items-20) -- [Postgres Pro metrics](#postgres-pro-metrics) - - [Compressed File System](#compressed-file-system) + - [Postgres Pro metrics](#postgres-pro-metrics) + - [Compressed File System](#compressed-file-system) - [Items](#items-21) - [Discovery Rules](#discovery-rules-5) - - [pg_wait_sampling](#pg_wait_sampling) + - [pg_wait_sampling](#pg_wait_sampling) - [Items](#items-22) - [Graphs](#graphs-13) @@ -1203,57 +1204,68 @@ Default config: 4. **pg_probackup Discovery** Items: - - - - - - - - - - - - - - - - - - - - - - - - - - -
NamePg_probackup dir {#BACKUPDIR}: errorPg_probackup dir {#BACKUPDIR}: size
Keypg_probackup.dir.error[{#BACKUPDIR}]pg_probackup.dir.size[{#BACKUPDIR}]
TypeTextNumeric (float)
UnitsBytes
DeltaAs IsAs Is
+ +| Name | Key | Storage | Description | +| ---------------------------------------------------------- | ------------------------------------------------ | ------- | ---------------------------------------------------------- | +| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Total catalog size: /backups + /wal | +| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Subdirectory Size /backups | +| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Subdirectory Size /wal | +| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Duration in seconds of creating a complete backup | +| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Duration in seconds of creating an incremental backup | +| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation started | +| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Time (unixtime) when backup creation completed | +| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Current backup mode | +| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Current backup status | +| Pg_probackup dir {#BACKUPDIR}: error | pg_probackup.dir.error[{#BACKUPDIR}] | | A sign of an erroneous state or "ok" if everything is fine | Graphs: - - - - - - - - - -
NamePg_probackup: backup dir: {#BACKUPDIR} size
MetricsPg_probackup dir {#BACKUPDIR}: size
+ +1. Pg_probackup: backup dir: {#BACKUPDIR} size + +Shows 3 metrics with information about the size of directories with archival copies: + +| Key | Side graphs | Description | +| ------------------------------------------- | ------------ | ------------------------------------ | +| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Total Directory Size /backups + /wal | +| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | Subdirectory size /backups | +| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | Subdirectory size /wal | + +2. Pg_probackup: backup dir: {#BACKUPDIR} duration + +Shows 2 metrics with a duration of creating archive copies: + +| Key | Side graphs | Description | +| -------------------------------------------- | ------------ | ----------------------------------------------------- | +| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Duration in seconds of creating a complete backup | +| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Duration in seconds of creating an incremental backup | Triggers: - - - - - - - - - -
NameError in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})
ExpressionTriggers if pg_probackup status is not OK.
+ +The following alerts have been created that allow you to monitor the status of archive directories: + +* The alert triggers if creating a backup takes longer than specified in the configuration parameter `max_time_run_backup2alert_in_sec`. Time is specified in seconds and default value = 21600 (6 hours). The current state of the backup creation process is monitored. + +| Category | Details | +| ----------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Importance: | Warning | +| Name: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) | +| Expression: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec | + +* The alert triggers if no new backup has been created for longer than specified in the configuration parameter `max_time_lack_backup2alert_in_sec`. Time is specified in seconds and default value = 100800 (28 hours). It checks that the next backup (of any type) is created no later than specified in the parameter. 
+ +| Category | Details | +| ----------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Importance: | Warning | +| Name: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} | +| Expression: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec | + +* The alert triggers if an error occurred when creating a backup - 'ERROR', 'CORRUPT', 'ORPHAN'. It checks the state of every archive copy, not only the latest one. It stays active as long as any archive copy has an erroneous state. + +| Category | Details | +| ----------- | ----------------------------------------------------------------------------------- | +| Importance: | Average | +| Name: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) | +| Expression: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 | ### Graphs diff --git a/documentation/metrics.ru.md b/documentation/metrics.ru.md new file mode 100644 index 00000000..1cab5da6 --- /dev/null +++ b/documentation/metrics.ru.md @@ -0,0 +1,82 @@ +# Описания плагинов + +## pg_probackup.py +Предназначен для контроля за состоянием каталогов бэкапов создаваемых утилитой [pg_probackup](https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup). +Плагин адаптирован для контроля нескольких инстансов в одном каталоге. Имя инстанса указывается в ключе метрики как подкаталог. + +### Настройки в секции [pgprobackup] + +| Наименование | Ключ | Описание | +| --------------------------------- | ------------------------- | ------------------------------------------------------------------ | +| enabled | False | По умолчанию плагин отключен. Укажите True для включения | +| interval | 900 | Как часто опрашивать состояние каталогов. 
Указано в секундах | +| backup_dirs | /backup_dir1,/backup_dir2 | Список каталогов бэкапов утилиты pg_probackup | +| pg_probackup_path | /usr/bin/pg_probackup-13 | Полный путь к утилите создания бэкапов pg_probackup | +| max_time_run_backup2alert_in_sec | 21600 | Время срабатывания алерта "Backup runs too long on..." в секундах. | +| max_time_lack_backup2alert_in_sec | 100800 | Время срабатывания алерта "Long time no backups on..." в секундах. | + + +### Текущие метрики в Discovery правиле: + +| Наименование | Ключ | Хранить | Описание | +| ---------------------------------------------------------- | ------------------------------------------------ | ------- | -------------------------------------------------------- | +| Pg_probackup dir {#BACKUPDIR}: size | pg_probackup.dir.size[{#BACKUPDIR}] | 31d | Общий размер каталога: /backups + /wal | +| Pg_probackup dir {#BACKUPDIR}/backups: size | pg_probackup.dir.size[{#BACKUPDIR}/backups] | 31d | Размер подкаталога /backups | +| Pg_probackup dir {#BACKUPDIR}/wal: size | pg_probackup.dir.size[{#BACKUPDIR}/wal] | 31d | Размер подкаталога /wal | +| Pg_probackup dir {#BACKUPDIR}: duration full backup | pg_probackup.dir.duration_full[{#BACKUPDIR}] | 31d | Длительность в секундах создания полного бэкапа | +| Pg_probackup dir {#BACKUPDIR}: duration incremental backup | pg_probackup.dir.duration_inc[{#BACKUPDIR}] | 31d | Длительность в секундах создания инкрементального бэкапа | +| Pg_probackup dir {#BACKUPDIR}: start time backup | pg_probackup.dir.start_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) старта создания бэкапа | +| Pg_probackup dir {#BACKUPDIR}: end time backup | pg_probackup.dir.end_time_backup[{#BACKUPDIR}] | | Время (UNIXTIME) завершения создания бэкапа | +| Pg_probackup dir {#BACKUPDIR}: mode | pg_probackup.dir.mode_backup[{#BACKUPDIR}] | | Текущий режим бэкапа | +| Pg_probackup dir {#BACKUPDIR}: status | pg_probackup.dir.status_backup[{#BACKUPDIR}] | | Текущий статус бэкапа | +| Pg_probackup dir {#BACKUPDIR}: 
error | pg_probackup.dir.error[{#BACKUPDIR}] | | Признак ошибочного состояния или "ok" если всё хорошо | + + +### Текущие алерты в Discovery правиле: +Созданы следующие алерты, позволяющие контролировать состояние архивных каталогов: + +* Алерт срабатывает, если создание бэкапа выполняется дольше, чем указано в настроечном параметре `max_time_run_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 21600 (6 часов). Контролируется текущее состояние, в котором находится процесс создания бэкапной копии. + +| Категория | Детали | +| ------------- | ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| Важность: | Warning | +| Наименование: | Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING) | +| Выражение: | {PostgresPro-Linux:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING" and ( {PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{PostgresPro-Linux:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()}) > max_time_run_backup2alert_in_sec | + +* Алерт срабатывает, если не выполняется создание нового бэкапа дольше, чем указано в настроечном параметре `max_time_lack_backup2alert_in_sec`. Время задаётся в секундах и значение по умолчанию = 100800 (28 часов). Контролируется, что очередной бэкап (тип бэкапа любой) будет создан не позже, чем указано в параметре. 
+ +| Категория | Детали | +| ------------- | ------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------ | +| Важность: | Warning | +| Наименование: | Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} | +| Выражение: | ( {PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()} -{PostgresPro-Linux:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()}) > max_time_lack_backup2alert_in_sec | + +* Алерт срабатывает если при создании бэкапа произошла ошибка - 'ERROR', 'CORRUPT', 'ORPHAN'. Контролирует состояние любой архивной копии, не только последней. Активен всё время пока есть любая архивная копия с ошибочным состоянием. + +| Категория | Детали | +| ------------- | ----------------------------------------------------------------------------------- | +| Важность: | Average | +| Наименование: | Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE}) | +| Выражение: | {PostgresPro-Linux:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1 | + + +### Текущие графики в Discovery правиле: + +1. Pg_probackup: backup dir: {#BACKUPDIR} size + +Показывает 3 метрики с информацией о размерах каталогов с архивными копиями: + +| Метрика | Сторона графика | Описание | +| ------------------------------------------- | --------------- | -------------------------------------- | +| pg_probackup.dir.size[{#BACKUPDIR}] | (Left Side) | Общий размер каталогов /backups + /wal | +| pg_probackup.dir.size[{#BACKUPDIR}/backups] | (Left Side) | размер подкаталога /backups | +| pg_probackup.dir.size[{#BACKUPDIR}/wal] | (Right Side) | размер подкаталога /wal | + +2. 
Pg_probackup: backup dir: {#BACKUPDIR} duration + +Показывает 2 метрики с длительностью создания архивных копий: + +| Метрика | Сторона графика | Описание | +| -------------------------------------------- | --------------- | -------------------------------------------------------- | +| pg_probackup.dir.duration_full[{#BACKUPDIR}] | (Left Side) | Длительность в секундах создания полного бэкапа | +| pg_probackup.dir.duration_inc[{#BACKUPDIR}] | (Right Side) | Длительность в секундах создания инкрементального бэкапа | diff --git a/mamonsu/plugins/system/linux/pg_probackup.py b/mamonsu/plugins/system/linux/pg_probackup.py index 2472aa52..1f937436 100644 --- a/mamonsu/plugins/system/linux/pg_probackup.py +++ b/mamonsu/plugins/system/linux/pg_probackup.py @@ -3,17 +3,28 @@ import json import os import subprocess - +from datetime import datetime class PgProbackup(Plugin): os_walk_error = None block_size = 4096 - Interval = 5 * 60 + Interval = 15 * 60 key_main = 'pg_probackup.discovery{0}' key_dir_size = 'pg_probackup.dir.size{0}' key_dir_error = 'pg_probackup.dir.error{0}' + key_dir_duration_full = 'pg_probackup.dir.duration_full{0}' + key_dir_duration_inc = 'pg_probackup.dir.duration_inc{0}' + key_dir_endtime_backup = 'pg_probackup.dir.end_time_backup{0}' + key_dir_starttime_backup = 'pg_probackup.dir.start_time_backup{0}' + key_dir_status_backup = 'pg_probackup.dir.status_backup{0}' + key_dir_mode_backup = 'pg_probackup.dir.mode_backup{0}' AgentPluginType = 'pg' Type = "mamonsu" + + DEFAULT_CONFIG = { + 'max_time_run_backup2alert_in_sec': str(21600), # The maximum time of running time of backup to Alert in seconds (6 hours) + 'max_time_lack_backup2alert_in_sec': str(100800), # The maximum time of lack of backup to Alert (28 hours) + } def set_os_walk_error(self, e): self.os_walk_error = e @@ -56,50 +67,102 @@ def run(self, zbx): """Disable plugin and exit, because the parameter 'backup_dirs' in section [pgprobackup] is not set. 
Set this parameter if needed and restart.""") + fmt_data = '%Y-%m-%d %H:%M:%S+03' backup_dirs = config_backup_dirs.split(',') dirs = [] - for _dir in backup_dirs: - dirs.append({'{#BACKUPDIR}': _dir}) - - dir_size = self.dir_size(_dir) - if self.os_walk_error: - self.log.error( - "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format( - backup_catalog=_dir, error=str(self.os_walk_error))) - else: - zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size) + for _dir_top in backup_dirs: # Search for backups with bad status is done by running # "pg_probackup show -B backup_dir" command - command = [config_pg_probackup_path, 'show', '-B', _dir, '--format=json'] + command = [config_pg_probackup_path, 'show', '-B', _dir_top, '--format=json'] process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE) stdout, stderr = process.communicate() return_code = process.returncode if return_code != 0: self.log.error( "The command: {command} return code {return_code}. Error: {error}".format(command=command, - return_code=return_code, - error=stderr)) + return_code=return_code, + error=stderr)) continue try: result = json.loads(stdout.decode('utf-8')) except Exception as e: self.log.error('Error in convert data: {stdout} \n {e}'.format(stdout=stdout, e=e)) continue + no_error= True + for instance in result: - for backup in instance.get('backups', []): + # We consider the sizes of each instance + instance_name = instance['instance'] + _dir = _dir_top + '/' + instance_name + dirs.append({'{#BACKUPDIR}': _dir}) + + # sud-directory backups + dir_size_backups = self.dir_size(_dir_top + '/backups/' + instance_name) + if self.os_walk_error: + self.log.error( + "Error in count size pg_probackup dir: {backup_catalog}. 
Error: {error}".format( + backup_catalog=(_dir_top + '/backups/' + instance_name), error=str(self.os_walk_error))) + else: + # We consider the size of the predefined directories - backups + zbx.send(self.key_dir_size.format('[' + _dir + '/backups]'), dir_size_backups) + + # sud-directory wal + dir_size_wal = self.dir_size(_dir_top + '/wal/' + instance_name) + if self.os_walk_error: + self.log.error( + "Error in count size pg_probackup dir: {backup_catalog}. Error: {error}".format( + backup_catalog=(_dir_top + '/wal/' + instance_name), error=str(self.os_walk_error))) + else: + # We consider the size of the predefined directories - wal + zbx.send(self.key_dir_size.format('[' + _dir + '/wal]'), dir_size_wal) + + # We consider the size of the predefined directories - backups and wal + zbx.send(self.key_dir_size.format('[' + _dir + ']'), dir_size_backups+dir_size_wal) + + full_send = 0 + for idx, backup in enumerate(instance.get('backups', [])): status = backup['status'] + mode = backup['backup-mode'] + if idx == 0: + # Status of the last backup + zbx.send(self.key_dir_status_backup.format('[' + _dir + ']'), status) + # Backup Creation Mode Full, Page, Delta and Ptrack of the last backup + zbx.send(self.key_dir_mode_backup.format('[' + _dir + ']'), mode) if status in ['ERROR', 'CORRUPT', 'ORPHAN']: error = 'Backup with id: {backup_id} in instance: {instance_name} in pg_probackup dir: ' \ - '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'], - instance_name=instance['instance'], - status=status, backup_catalog=_dir) + '{backup_catalog} has status: {status}.'.format(backup_id=backup['id'], + instance_name=instance_name, + status=status, backup_catalog=_dir) self.log.info(error) no_error = False zbx.send(self.key_dir_error.format('[' + _dir + ']'), error) - if no_error: - zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok') + if idx == 0: + # the start time of the last backup at unixtime + start = datetime.strptime(backup['start-time'], fmt_data) 
+ zbx.send(self.key_dir_starttime_backup.format('[' + _dir + ']'), start.timestamp()) + # check end-time and calculate duration + if 'end-time' in backup: + end = datetime.strptime(backup['end-time'], fmt_data) + delta = (end - start).total_seconds() + # the end time of the last backup at unixtime + zbx.send(self.key_dir_endtime_backup.format('[' + _dir + ']'), end.timestamp()) + # duration full or incremental of the last backup + if backup['backup-mode'] == "FULL": + zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta) + full_send = 1 + else: + zbx.send(self.key_dir_duration_inc.format('[' + _dir + ']'), delta) + if full_send == 0 and 'end-time' in backup and backup['backup-mode'] == "FULL": + start = datetime.strptime(backup['start-time'], fmt_data) + end = datetime.strptime(backup['end-time'], fmt_data) + delta = (end - start).total_seconds() + zbx.send(self.key_dir_duration_full.format('[' + _dir + ']'), delta) + full_send = 1 + + if no_error: + zbx.send(self.key_dir_error.format('[' + _dir + ']'), 'ok') zbx.send(self.key_main.format('[]'), zbx.json({'data': dirs})) del dirs @@ -128,24 +191,132 @@ def discovery_rules(self, template, dashboard=False): 'name': 'Pg_probackup dir {#BACKUPDIR}: size', 'units': Plugin.UNITS.bytes, 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, - 'delay': self.plugin_config('interval')}, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "Size of the entire catalog with backups"}, + {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,"), + 'name': 'Pg_probackup dir {#BACKUPDIR}/backups: size', + 'units': Plugin.UNITS.bytes, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The size of the entire subdirectory /backups"}, + {'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"), + 'name': 'Pg_probackup dir {#BACKUPDIR}/wal: size', + 'units': Plugin.UNITS.bytes, 
+ 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The size of the entire subdirectory /wal"}, {'key': self.right_type(self.key_dir_error, var_discovery="{#BACKUPDIR},"), 'name': 'Pg_probackup dir {#BACKUPDIR}: error', 'value_type': Plugin.VALUE_TYPE.text, - 'delay': self.plugin_config('interval')}, + 'delay': self.plugin_config('interval'), + 'description': "Sign of the erroneous completion of the backup: 'ERROR', 'CORRUPT', 'ORPHAN'"}, + {'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: duration full backup', + 'units': Plugin.UNITS.s, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The duration of the last full backup"}, + {'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: duration incremental backup', + 'units': Plugin.UNITS.s, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'history': '31', + 'delay': self.plugin_config('interval'), + 'description': "The duration of the last incremental backup"}, + {'key': self.right_type(self.key_dir_endtime_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: end time backup', + 'units': Plugin.UNITS.unixtime, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'delay': self.plugin_config('interval'), + 'description': "The end time of the last any backup"}, + {'key': self.right_type(self.key_dir_starttime_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: start time backup', + 'units': Plugin.UNITS.unixtime, + 'value_type': Plugin.VALUE_TYPE.numeric_unsigned, + 'delay': self.plugin_config('interval'), + 'description': "The start time of the last any backup"}, + {'key': self.right_type(self.key_dir_status_backup, var_discovery="{#BACKUPDIR},"), + 
'name': 'Pg_probackup dir {#BACKUPDIR}: status', + 'value_type': Plugin.VALUE_TYPE.text, + 'delay': self.plugin_config('interval'), + 'description': "Sign of the status completion of the last backup:\n\n" + "OK — the backup is complete and valid.\n" + "DONE — the backup is complete, but was not validated.\n" + "RUNNING — the backup is in progress.\n" + "MERGING — the backup is being merged.\n" + "MERGED — the backup data files were successfully merged, but its metadata is in the process of being updated. Only full backups can have this status.\n" + "DELETING — the backup files are being deleted.\n" + "CORRUPT — some of the backup files are corrupt.\n" + "ERROR — the backup was aborted because of an unexpected error.\n" + "ORPHAN — the backup is invalid because one of its parent backups is corrupt or missing.\n\n" + "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup" + }, + {'key': self.right_type(self.key_dir_mode_backup, var_discovery="{#BACKUPDIR},"), + 'name': 'Pg_probackup dir {#BACKUPDIR}: mode', + 'value_type': Plugin.VALUE_TYPE.text, + 'delay': self.plugin_config('interval'), + 'description': "Backup Creation Mode:\n\n" + "FULL — creates a full backup that contains all the data files of the cluster to be restored.\n" + "DELTA — reads all data files in the data directory and creates an incremental backup for pages that have changed since the previous backup.\n" + "PAGE — creates an incremental backup based on the WAL files that have been generated since the previous full or incremental backup was taken. 
Only changed blocks are read from data files.\n" + "PTRACK — creates an incremental backup tracking page changes on the fly.\n\n" + "https://postgrespro.ru/docs/postgrespro/current/app-pgprobackup" + }, ] graphs = [ + { + 'name': 'Pg_probackup: backup dir: {#BACKUPDIR} duration', + 'type': 0, + 'items': [ + {'color': '00897B', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_duration_full, var_discovery="{#BACKUPDIR},")}, + {'color': '66BB6A', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_duration_inc, var_discovery="{#BACKUPDIR},"), + 'yaxisside': 1} + ] + }, { 'name': 'Pg_probackup: backup dir: {#BACKUPDIR} size', - 'type': 1, + 'type': 0, 'items': [ - {'color': '00CC00', - 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")}] + {'color': 'C8E6C9', + 'drawtype': 1, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR},")}, + {'color': '00897B', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/backups,")}, + {'color': '66BB6A', + 'drawtype': 2, + 'key': self.right_type(self.key_dir_size, var_discovery="{#BACKUPDIR}/wal,"), + 'yaxisside': 1} + ] }, ] - triggers = [{ - 'name': 'Error in pg_probackup dir ' - '{#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})', - 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1'} + triggers = [ + {'name': 'Error in pg_probackup dir {#BACKUPDIR} (hostname={HOSTNAME} value={ITEM.LASTVALUE})', + 'expression': '{#TEMPLATE:pg_probackup.dir.error[{#BACKUPDIR}].str(ok)}<>1', + 'priority': 3, + 'description': 'Backup status: CORRUPT / ERROR / ORPHAN'}, + {'name': 'Long time no backups on {HOSTNAME} in pg_probackup dir {#BACKUPDIR}', + 'expression': '({#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.end_time_backup[{#BACKUPDIR}].last()})>' + + self.plugin_config('max_time_lack_backup2alert_in_sec'), + 'priority': 2, + 'description': 'From the moment of completion of the 
backup passed more than ' + + str(int(int(self.plugin_config('max_time_lack_backup2alert_in_sec'))/3600)) + ' hours (' + + self.plugin_config('max_time_lack_backup2alert_in_sec') + ' seconds)'}, + {'name': 'Backup runs too long on {HOSTNAME} in pg_probackup dir {#BACKUPDIR} (RUNNING)', + 'expression': '{#TEMPLATE:pg_probackup.dir.status_backup[{#BACKUPDIR}].last()}="RUNNING"' + ' and ({#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].now()}-{#TEMPLATE:pg_probackup.dir.start_time_backup[{#BACKUPDIR}].last()})>' + + self.plugin_config('max_time_run_backup2alert_in_sec'), + 'priority': 2, + 'description': 'From the moment of start of the backup passed more than ' + + str(int(int(self.plugin_config('max_time_run_backup2alert_in_sec'))/3600)) + ' hours (' + + self.plugin_config('max_time_run_backup2alert_in_sec') + ' seconds)'}, ] return template.discovery_rule(rule=rule, conditions=conditions, items=items, graphs=graphs, triggers=triggers) diff --git a/packaging/conf/example.conf b/packaging/conf/example.conf index b313b6c3..14fe99fc 100644 --- a/packaging/conf/example.conf +++ b/packaging/conf/example.conf @@ -183,9 +183,11 @@ interval = 60 # Trigger fires if some backup has bad status e.g. (ERROR,CORRUPT,ORPHAN). [pgprobackup] enabled = False -interval = 300 +interval = 900 backup_dirs = /backup_dir1,/backup_dir2 -pg_probackup_path = /usr/bin/pg_probackup-11 +pg_probackup_path = /usr/bin/pg_probackup-13 +max_time_run_backup2alert_in_sec = 21600 +max_time_lack_backup2alert_in_sec = 100800 # Get size of relations defined in this section # Relations - comma separated list of objects - tables and endexes (database_name.schema.relation) used to calculate relations size.