Skip to content

Commit c0aa8fe

Browse files
authored
🎉 Support recovered alarm events carrying the current latest metric value (#152)
1 parent 3198464 commit c0aa8fe

File tree

3 files changed

+55
-32
lines changed

3 files changed

+55
-32
lines changed

alert/eval/eval.go

+12-5
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@ type (
2323
Submit(rule models.AlertRule)
2424
Stop(ruleId string)
2525
Eval(ctx context.Context, rule models.AlertRule)
26-
Recover(ruleId, faultCenterKey string, faultCenterInfoKey string, curKeys []string)
26+
Recover(ruleId, faultCenterKey string, faultCenterInfoKey string, curKeys []string, curMetrics map[string]interface{})
2727
RestartAllEvals()
2828
}
2929

@@ -82,6 +82,7 @@ func (t *AlertRule) Eval(ctx context.Context, rule models.AlertRule) {
8282
}
8383

8484
var curFingerprints []string
85+
var fingerPrintMetrics map[string]interface{}
8586
for _, dsId := range rule.DatasourceIdList {
8687
instance, err := t.ctx.DB.Datasource().GetInstance(dsId)
8788
if err != nil {
@@ -95,9 +96,10 @@ func (t *AlertRule) Eval(ctx context.Context, rule models.AlertRule) {
9596
}
9697

9798
var fingerprints []string
99+
98100
switch rule.DatasourceType {
99101
case "Prometheus", "VictoriaMetrics":
100-
fingerprints = metrics(t.ctx, dsId, instance.Type, rule)
102+
fingerprints, fingerPrintMetrics = metrics(t.ctx, dsId, instance.Type, rule)
101103
case "AliCloudSLS", "Loki", "ElasticSearch":
102104
fingerprints = logs(t.ctx, dsId, instance.Type, rule)
103105
case "Jaeger":
@@ -113,7 +115,7 @@ func (t *AlertRule) Eval(ctx context.Context, rule models.AlertRule) {
113115
curFingerprints = append(curFingerprints, fingerprints...)
114116
}
115117
logc.Infof(t.ctx.Ctx, fmt.Sprintf("规则评估 -> %v", tools.JsonMarshal(rule)))
116-
t.Recover(rule.RuleId, models.BuildCacheEventKey(rule.TenantId, rule.FaultCenterId), models.BuildCacheInfoKey(rule.TenantId, rule.FaultCenterId), curFingerprints)
118+
t.Recover(rule.RuleId, models.BuildCacheEventKey(rule.TenantId, rule.FaultCenterId), models.BuildCacheInfoKey(rule.TenantId, rule.FaultCenterId), curFingerprints, fingerPrintMetrics)
117119
t.GC(rule, curFingerprints)
118120

119121
case <-ctx.Done():
@@ -134,7 +136,7 @@ func (t *AlertRule) getEvalTimeDuration(evalTimeType string, evalInterval int64)
134136
}
135137
}
136138

137-
func (t *AlertRule) Recover(RuleId, faultCenterKey string, faultCenterInfoKey string, curFingerprints []string) {
139+
func (t *AlertRule) Recover(RuleId, faultCenterKey string, faultCenterInfoKey string, curFingerprints []string, fingerPrintMetrics map[string]interface{}) {
138140
// 获取所有的故障中心的告警事件
139141
events, err := t.ctx.Redis.Event().GetAllEventsForFaultCenter(faultCenterKey)
140142
if err != nil {
@@ -191,7 +193,12 @@ func (t *AlertRule) Recover(RuleId, faultCenterKey string, faultCenterInfoKey st
191193
event.IsRecovered = true
192194
event.RecoverTime = curTime
193195
event.LastSendTime = 0
194-
event.Metric["value"] = ""
196+
currentMetrics := fingerPrintMetrics[fingerprint]
197+
if metric, ok := currentMetrics.(map[string]interface{}); ok {
198+
event.Metric["value"] = metric["value"]
199+
} else {
200+
event.Metric["value"] = 0
201+
}
195202

196203
t.ctx.Redis.Event().PushEventToFaultCenter(&event)
197204
// Once recovery has been triggered, delete the key that was pending recovery

alert/eval/query.go

+33-27
Original file line numberDiff line numberDiff line change
@@ -15,51 +15,54 @@ import (
1515
)
1616

1717
// Metrics 包含 Prometheus、VictoriaMetrics 数据源
18-
func metrics(ctx *ctx.Context, datasourceId, datasourceType string, rule models.AlertRule) []string {
18+
func metrics(ctx *ctx.Context, datasourceId, datasourceType string, rule models.AlertRule) ([]string, map[string]interface{}) {
1919
pools := ctx.Redis.ProviderPools()
2020
var (
2121
resQuery []provider.Metrics
2222
externalLabels map[string]interface{}
2323
)
24-
24+
fingerPrintMetrics := make(map[string]interface{})
2525
switch datasourceType {
2626
case provider.PrometheusDsProvider:
2727
cli, err := pools.GetClient(datasourceId)
2828
if err != nil {
2929
logc.Errorf(ctx.Ctx, err.Error())
30-
return nil
30+
return nil, nil
3131
}
3232

3333
resQuery, err = cli.(provider.PrometheusProvider).Query(rule.PrometheusConfig.PromQL)
3434
if err != nil {
3535
logc.Error(ctx.Ctx, err.Error())
36-
return nil
36+
return nil, nil
3737
}
3838

3939
externalLabels = cli.(provider.PrometheusProvider).GetExternalLabels()
4040
case provider.VictoriaMetricsDsProvider:
4141
cli, err := pools.GetClient(datasourceId)
4242
if err != nil {
4343
logc.Errorf(ctx.Ctx, err.Error())
44-
return nil
44+
return nil, nil
4545
}
4646

4747
resQuery, err = cli.(provider.VictoriaMetricsProvider).Query(rule.PrometheusConfig.PromQL)
4848
if err != nil {
4949
logc.Error(ctx.Ctx, err.Error())
50-
return nil
50+
return nil, nil
5151
}
5252

5353
externalLabels = cli.(provider.VictoriaMetricsProvider).GetExternalLabels()
5454
default:
5555
logc.Errorf(ctx.Ctx, fmt.Sprintf("Unsupported metrics type, type: %s", datasourceType))
56-
return nil
56+
return nil, nil
5757
}
5858

5959
if resQuery == nil {
60-
return nil
60+
return nil, nil
6161
}
6262

63+
// 获取已缓存事件指纹
64+
fingerPrintMap := process.GetFingerPrint(ctx, rule.TenantId, rule.FaultCenterId, rule.RuleId)
65+
6366
var curFingerprints []string
6467
for _, v := range resQuery {
6568
for _, ruleExpr := range rule.PrometheusConfig.Rules {
@@ -69,36 +72,39 @@ func metrics(ctx *ctx.Context, datasourceId, datasourceType string, rule models.
6972
continue
7073
}
7174

72-
event := func() *models.AlertCurEvent {
73-
event := process.BuildEvent(rule)
74-
event.DatasourceId = datasourceId
75-
event.Fingerprint = v.GetFingerprint()
76-
event.Metric = v.GetMetric()
77-
event.Metric["severity"] = ruleExpr.Severity
78-
for ek, ev := range externalLabels {
79-
event.Metric[ek] = ev
80-
}
81-
event.Severity = ruleExpr.Severity
82-
event.Annotations = tools.ParserVariables(rule.PrometheusConfig.Annotations, event.Metric)
83-
event.SearchQL = rule.PrometheusConfig.PromQL
84-
curFingerprints = append(curFingerprints, event.Fingerprint)
85-
86-
return &event
87-
}
88-
8975
option := models.EvalCondition{
9076
Operator: operator,
9177
QueryValue: v.Value,
9278
ExpectedValue: value,
9379
}
9480

81+
event := process.BuildEvent(rule)
82+
event.DatasourceId = datasourceId
83+
event.Fingerprint = v.GetFingerprint()
84+
event.Metric = v.GetMetric()
85+
event.Metric["severity"] = ruleExpr.Severity
86+
for ek, ev := range externalLabels {
87+
event.Metric[ek] = ev
88+
}
89+
event.Severity = ruleExpr.Severity
90+
event.Annotations = tools.ParserVariables(rule.PrometheusConfig.Annotations, event.Metric)
91+
event.SearchQL = rule.PrometheusConfig.PromQL
92+
9593
if process.EvalCondition(option) {
96-
process.PushEventToFaultCenter(ctx, event())
94+
// 如果告警条件满足需要将告警事件指纹加入,不满足时应当直接跳过
95+
curFingerprints = append(curFingerprints, event.Fingerprint)
96+
97+
process.PushEventToFaultCenter(ctx, &event)
98+
} else {
99+
// 仅更新已经触发事件的指纹对应指标
100+
if _, exist := fingerPrintMap[event.Fingerprint]; exist {
101+
fingerPrintMetrics[event.Fingerprint] = event.Metric
102+
}
97103
}
98104
}
99105
}
100106

101-
return curFingerprints
107+
return curFingerprints, fingerPrintMetrics
102108
}
103109

104110
// Logs 包含 AliSLS、Loki、ElasticSearch 数据源

alert/process/process.go

+10
Original file line numberDiff line numberDiff line change
@@ -169,3 +169,13 @@ func RecordAlertHisEvent(ctx *ctx.Context, alert models.AlertCurEvent) error {
169169

170170
return nil
171171
}
172+
173+
// GetFingerPrint returns, as a set keyed by fingerprint, the fingerprints that ctx.Redis.Event().GetFingerprintsByRuleId yields for the given tenant, fault center and rule; the returned map is always non-nil.
174+
func GetFingerPrint(ctx *ctx.Context, tenantId string, faultCenterId string, ruleId string) map[string]struct{} {
175+
fingerPrints := ctx.Redis.Event().GetFingerprintsByRuleId(tenantId, faultCenterId, ruleId)
176+
fingerPrintMap := make(map[string]struct{})
177+
for _, fingerPrint := range fingerPrints {
178+
fingerPrintMap[fingerPrint] = struct{}{}
179+
}
180+
return fingerPrintMap
181+
}

0 commit comments

Comments
 (0)