Skip to content

Commit d5e540c

Browse files
dashboard: add panels for Tarantool 3 configuration
Closes #224
1 parent 784c5dd commit d5e540c

16 files changed

+5114
-2182
lines changed

CHANGELOG.md

+3
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,9 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
66

77
## [Unreleased]
88

9+
### Added
10+
- Panels for Tarantool 3 configuration status and alerts (#224)
11+
912
### Changed
1013
- Use consistent style for panel requirements (PR #231)
1114

dashboard/panels/cluster.libsonnet

+173
Original file line numberDiff line numberDiff line change
@@ -373,6 +373,179 @@ local prometheus = grafana.prometheus;
373373
level='critical',
374374
),
375375

376+
local tarantool3_config_description_note(description) = std.join('\n', [description, |||
377+
Panel minimal requirements: metrics 1.2.0, Tarantool 3.
378+
|||]),
379+
380+
tarantool3_config_status(
381+
cfg,
382+
title='Tarantool configuration status',
383+
description=tarantool3_config_description_note(|||
384+
Current Tarantool 3 configuration apply status for a cluster instance.
385+
`uninitialized` describes uninitialized instance,
386+
`check_errors` describes instance with at least one apply error,
387+
`check_warnings` describes instance with at least one apply warning,
388+
`startup_in_progress` describes instance doing initial configuration apply,
389+
`reload_in_progress` describes instance doing configuration apply over existing configuration,
390+
`ready` describes a healthy instance.
391+
392+
Panel minimal requirements: Grafana 8.
393+
|||),
394+
):: timeseries.new(
395+
title=title,
396+
description=description,
397+
datasource=cfg.datasource,
398+
panel_width=12,
399+
max=6,
400+
min=1,
401+
).addValueMapping(
402+
1, 'dark-red', 'uninitialized'
403+
).addRangeMapping(
404+
1.001, 1.999, '-'
405+
).addValueMapping(
406+
2, 'red', 'check_errors'
407+
).addRangeMapping(
408+
2.001, 2.999, '-'
409+
).addValueMapping(
410+
3, 'yellow', 'startup_in_progress'
411+
).addRangeMapping(
412+
3.001, 3.999, '-'
413+
).addValueMapping(
414+
4, 'dark-yellow', 'reload_in_progress'
415+
).addRangeMapping(
416+
4.001, 4.999, '-'
417+
).addValueMapping(
418+
5, 'dark-orange', 'check_warnings'
419+
).addRangeMapping(
420+
5.001, 5.999, '-'
421+
).addValueMapping(
422+
6, 'green', 'ready'
423+
).addTarget(
424+
if cfg.type == variable.datasource_type.prometheus then
425+
local expr = std.format(
426+
|||
427+
1 * %(metric_full_name)s{%(uninitialized_filters)s} + on(alias)
428+
2 * %(metric_full_name)s{%(check_errors_filters)s} + on(alias)
429+
3 * %(metric_full_name)s{%(startup_in_progress_filters)s} + on(alias)
430+
4 * %(metric_full_name)s{%(reload_in_progress_filters)s} + on(alias)
431+
5 * %(metric_full_name)s{%(check_warnings_filters)s} + on(alias)
432+
6 * %(metric_full_name)s{%(ready_filters)s}
433+
|||, {
434+
metric_full_name: cfg.metrics_prefix + 'tnt_config_status',
435+
uninitialized_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'uninitialized'] }),
436+
check_errors_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'check_errors'] }),
437+
startup_in_progress_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'startup_in_progress'] }),
438+
reload_in_progress_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'reload_in_progress'] }),
439+
check_warnings_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'check_warnings'] }),
440+
ready_filters: common.prometheus_query_filters(cfg.filters { status: ['=', 'ready'] }),
441+
}
442+
);
443+
prometheus.target(expr=expr, legendFormat='{{alias}}')
444+
else if cfg.type == variable.datasource_type.influxdb then
445+
local query = std.format(|||
446+
SELECT (1 * last("uninitialized") + 2 * last("check_errors") + 3 * last("startup_in_progress") +
447+
4 * last("reload_in_progress") + 5 * last("check_warnings") + 6 * last("ready")) as "status" FROM
448+
(
449+
SELECT "value" as "uninitialized" FROM %(measurement_with_policy)s
450+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(uninitialized_filters)s) AND $timeFilter
451+
),
452+
(
453+
SELECT "value" as "check_errors" FROM %(measurement_with_policy)s
454+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(check_errors_filters)s) AND $timeFilter
455+
),
456+
(
457+
SELECT "value" as "startup_in_progress" FROM %(measurement_with_policy)s
458+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(startup_in_progress_filters)s) AND $timeFilter
459+
),
460+
(
461+
SELECT "value" as "reload_in_progress" FROM %(measurement_with_policy)s
462+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(reload_in_progress_filters)s) AND $timeFilter
463+
),
464+
(
465+
SELECT "value" as "check_warnings" FROM %(measurement_with_policy)s
466+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(check_warnings_filters)s) AND $timeFilter
467+
),
468+
(
469+
SELECT "value" as "ready" FROM %(measurement_with_policy)s
470+
WHERE ("metric_name" = '%(metric_full_name)s' AND %(ready_filters)s) AND $timeFilter
471+
)
472+
GROUP BY time($__interval), "label_pairs_alias" fill(0)
473+
|||, {
474+
metric_full_name: cfg.metrics_prefix + 'tnt_config_status',
475+
measurement_with_policy: std.format('%(policy_prefix)s"%(measurement)s"', {
476+
policy_prefix: if cfg.policy == 'default' then '' else std.format('"%(policy)s".', cfg.policy),
477+
measurement: cfg.measurement,
478+
}),
479+
uninitialized_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'uninitialized'] }),
480+
check_errors_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'check_errors'] }),
481+
startup_in_progress_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'startup_in_progress'] }),
482+
reload_in_progress_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'reload_in_progress'] }),
483+
check_warnings_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'check_warnings'] }),
484+
ready_filters: common.influxdb_query_filters(cfg.filters { label_pairs_status: ['=', 'ready'] }),
485+
});
486+
influxdb.target(
487+
rawQuery=true,
488+
query=query,
489+
alias='$tag_label_pairs_alias',
490+
)
491+
),
492+
493+
local tarantool3_config_alerts(
494+
cfg,
495+
title,
496+
description,
497+
level,
498+
) = common.default_graph(
499+
cfg,
500+
title=title,
501+
description=tarantool3_config_description_note(description),
502+
min=0,
503+
legend_avg=false,
504+
legend_max=false,
505+
panel_height=8,
506+
panel_width=6,
507+
).addTarget(
508+
common.target(
509+
cfg,
510+
'tnt_config_alerts',
511+
additional_filters={
512+
[variable.datasource_type.prometheus]: { level: ['=', level] },
513+
[variable.datasource_type.influxdb]: { label_pairs_level: ['=', level] },
514+
},
515+
converter='last',
516+
),
517+
),
518+
519+
tarantool3_config_warning_alerts(
520+
cfg,
521+
title='Tarantool configuration warnings',
522+
description=|||
523+
Number of "warn" alerts on Tarantool 3 configuration apply on a cluster instance.
524+
"warn" alerts cover non-critical issues which do not result in apply failure,
525+
like missing a role to grant for a user.
526+
|||,
527+
):: tarantool3_config_alerts(
528+
cfg,
529+
title=title,
530+
description=description,
531+
level='warn',
532+
),
533+
534+
tarantool3_config_error_alerts(
535+
cfg,
536+
title='Tarantool configuration errors',
537+
description=|||
538+
Number of "error" alerts on Tarantool 3 configuration apply on a cluster instance.
539+
"error" alerts cover critical issues which result in apply failure,
540+
like instance missing itself in configuration.
541+
|||,
542+
):: tarantool3_config_alerts(
543+
cfg,
544+
title=title,
545+
description=description,
546+
level='error',
547+
),
548+
376549
failovers_per_second(
377550
cfg,
378551
title='Failovers triggered',

dashboard/section.libsonnet

+6
Original file line numberDiff line numberDiff line change
@@ -37,13 +37,19 @@ local vinyl = import 'dashboard/panels/vinyl.libsonnet';
3737
cluster.http_rps_stat(cfg) { gridPos: { w: 4, h: 5, x: 12, y: 4 } },
3838
cluster.net_rps_stat(cfg) { gridPos: { w: 4, h: 5, x: 16, y: 4 } },
3939
cluster.space_ops_stat(cfg) { gridPos: { w: 4, h: 5, x: 20, y: 4 } },
40+
cluster.tarantool3_config_status(cfg),
41+
cluster.tarantool3_config_warning_alerts(cfg),
42+
cluster.tarantool3_config_error_alerts(cfg),
4043
cluster.read_only_status(cfg, panel_width=24),
4144
cluster.election_state(cfg),
4245
cluster.election_vote(cfg),
4346
cluster.election_leader(cfg),
4447
cluster.election_term(cfg),
4548
] else if cfg.type == variable.datasource_type.influxdb then [
4649
cluster.row,
50+
cluster.tarantool3_config_status(cfg),
51+
cluster.tarantool3_config_warning_alerts(cfg),
52+
cluster.tarantool3_config_error_alerts(cfg),
4753
cluster.read_only_status(cfg, panel_width=24),
4854
cluster.election_state(cfg),
4955
cluster.election_vote(cfg),

doc/monitoring/alerting.rst

+42
Original file line numberDiff line numberDiff line change
@@ -219,6 +219,48 @@ sleeps.
219219
Some high loaded fiber has too little yields. It may be the reason of 'Too long WAL write' warnings."
220220
221221
222+
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
223+
Configuration status
224+
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
225+
226+
:ref:`Configuration status <config_api_reference_info>` displays
227+
Tarantool 3 configuration apply state. Additional metrics display the count
228+
of apply warnings and errors.
229+
230+
.. code-block:: yaml
231+
232+
- alert: ConfigWarningAlerts
233+
expr: tnt_config_alerts{level="warn"} > 0
234+
for: 1m
235+
labels:
236+
severity: warning
237+
annotations:
238+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'warn' alerts"
239+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'warn' alerts.
240+
Please, check config:info() for detailed info."
241+
242+
- alert: ConfigErrorAlerts
243+
expr: tnt_config_alerts{level="error"} > 0
244+
for: 1m
245+
labels:
246+
severity: page
247+
annotations:
248+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'error' alerts"
249+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'error' alerts.
250+
Latest configuration has not been applied.
251+
Please, check config:info() for detailed info."
252+
253+
- alert: ConfigStatusNotReady
254+
expr: tnt_config_status{status="ready"} == 0
255+
for: 5m
256+
labels:
257+
severity: warning
258+
annotations:
259+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') configuration is not ready"
260+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' configuration is not ready.
261+
Please, check config:info() for detailed info."
262+
263+
222264
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
223265
Cartridge issues
224266
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""""
-2.66 KB
Loading
1.05 KB
Loading
10.3 KB
Loading

example_cluster/prometheus/alerts.yml

+34
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,40 @@ groups:
8282
You are likely to hit limit soon.
8383
It is strongly recommended to increase memtx_memory or number of storages in case of sharded data."
8484

85+
# Warning for configuration warning alerts.
86+
- alert: ConfigWarningAlerts
87+
expr: tnt_config_alerts{level="warn"} > 0
88+
for: 1m
89+
labels:
90+
severity: warning
91+
annotations:
92+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'warn' alerts"
93+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'warn' alerts.
94+
Please, check config:info() for detailed info."
95+
96+
# Alert for configuration error alerts.
97+
- alert: ConfigErrorAlerts
98+
expr: tnt_config_alerts{level="error"} > 0
99+
for: 1m
100+
labels:
101+
severity: page
102+
annotations:
103+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') has configuration 'error' alerts"
104+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' has configuration 'error' alerts.
105+
Latest configuration has not been applied.
106+
Please, check config:info() for detailed info."
107+
108+
# Warning for configuration status.
109+
- alert: ConfigStatusNotReady
110+
expr: tnt_config_status{status="ready"} == 0
111+
for: 5m
112+
labels:
113+
severity: warning
114+
annotations:
115+
summary: "Instance '{{ $labels.alias }}' ('{{ $labels.job }}') configuration is not ready"
116+
description: "Instance '{{ $labels.alias }}' of job '{{ $labels.job }}' configuration is not ready.
117+
Please, check config:info() for detailed info."
118+
85119
# Alert for Tarantool replication high lag (both for masters and replicas).
86120
- alert: HighReplicationLag
87121
expr: tnt_replication_lag > 1

example_cluster/prometheus/test_alerts.yml

+91
Original file line numberDiff line numberDiff line change
@@ -167,6 +167,97 @@ tests:
167167
exp_alerts: # no alert firing
168168

169169

170+
- interval: 15s
171+
input_series:
172+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="uninitialized"}'
173+
values: '1+0x4 0+0x4 0+0x30'
174+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_errors"}'
175+
values: '0+0x4 0+0x4 0+0x30'
176+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_warnings"}'
177+
values: '0+0x4 0+0x4 0+0x30'
178+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="startup_in_progress"}'
179+
values: '0+0x4 1+0x4 0+0x30'
180+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="reload_in_progress"}'
181+
values: '0+0x4 0+0x4 0+0x30'
182+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="ready"}'
183+
values: '0+0x4 0+0x4 1+0x30'
184+
alert_rule_test:
185+
- eval_time: 10m
186+
alertname: ConfigStatusNotReady
187+
exp_alerts: # no alert firing
188+
189+
190+
- interval: 15s
191+
input_series:
192+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="uninitialized"}'
193+
values: '1+0x4 0+0x4 0+0x30'
194+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_errors"}'
195+
values: '0+0x4 0+0x4 1+0x30'
196+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="check_warnings"}'
197+
values: '0+0x4 0+0x4 0+0x30'
198+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="startup_in_progress"}'
199+
values: '0+0x4 1+0x4 0+0x30'
200+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="reload_in_progress"}'
201+
values: '0+0x4 0+0x4 0+0x30'
202+
- series: 'tnt_config_status{job="tarantool",instance="app:8081",alias="tnt_router",status="ready"}'
203+
values: '0+0x4 0+0x4 0+0x30'
204+
alert_rule_test:
205+
- eval_time: 10m
206+
alertname: ConfigStatusNotReady
207+
exp_alerts:
208+
- exp_labels:
209+
severity: warning
210+
instance: app:8081
211+
alias: tnt_router
212+
job: tarantool
213+
status: ready
214+
exp_annotations:
215+
summary: "Instance 'tnt_router' ('tarantool') configuration is not ready"
216+
description: "Instance 'tnt_router' of job 'tarantool' configuration is not ready.
217+
Please, check config:info() for detailed info."
218+
219+
220+
- interval: 15s
221+
input_series:
222+
- series: 'tnt_config_alerts{job="tarantool",instance="app:8081",alias="tnt_router",level="warn"}'
223+
values: '1+0x10'
224+
alert_rule_test:
225+
- eval_time: 2m
226+
alertname: ConfigWarningAlerts
227+
exp_alerts:
228+
- exp_labels:
229+
severity: warning
230+
instance: app:8081
231+
alias: tnt_router
232+
job: tarantool
233+
level: warn
234+
exp_annotations:
235+
summary: "Instance 'tnt_router' ('tarantool') has configuration 'warn' alerts"
236+
description: "Instance 'tnt_router' of job 'tarantool' has configuration 'warn' alerts.
237+
Please, check config:info() for detailed info."
238+
239+
240+
- interval: 15s
241+
input_series:
242+
- series: 'tnt_config_alerts{job="tarantool",instance="app:8081",alias="tnt_router",level="error"}'
243+
values: '1+0x10'
244+
alert_rule_test:
245+
- eval_time: 2m
246+
alertname: ConfigErrorAlerts
247+
exp_alerts:
248+
- exp_labels:
249+
severity: page
250+
instance: app:8081
251+
alias: tnt_router
252+
job: tarantool
253+
level: error
254+
exp_annotations:
255+
summary: "Instance 'tnt_router' ('tarantool') has configuration 'error' alerts"
256+
description: "Instance 'tnt_router' of job 'tarantool' has configuration 'error' alerts.
257+
Latest configuration has not been applied.
258+
Please, check config:info() for detailed info."
259+
260+
170261
- interval: 15s
171262
input_series:
172263
- series: 'tnt_slab_quota_used_ratio{job="tarantool",instance="app:8081",alias="tnt_router"}'

0 commit comments

Comments
 (0)