@@ -54,48 +54,39 @@ func (ag *AlertGroups) AddAlert(alert *models.AlertCurEvent, noticeGroup []map[s
54
54
ag .lock .Lock ()
55
55
defer ag .lock .Unlock ()
56
56
57
- // 查找Rule位置
58
- rulePos := sort .Search (len (ag .Rules ), func (i int ) bool {
59
- return ag .Rules [i ].RuleID >= alert .RuleId
60
- })
57
+ // 查找 Rule 位置
58
+ rulePos := ag .getRuleNodePos (alert .RuleId )
61
59
62
- // Rule存在时的处理 ,找到对应的规则组
60
+ // Rule 存在时的处理 ,找到对应的规则组
63
61
if rulePos < len (ag .Rules ) && ag .Rules [rulePos ].RuleID == alert .RuleId {
64
- groups := & ag .Rules [rulePos ].Groups
65
- // 查找Group位置
66
- groupPos := sort .Search (len (* groups ), func (i int ) bool {
67
- return (* groups )[i ].ID >= groupID
68
- })
62
+ rule := & ag .Rules [rulePos ]
69
63
70
- if groupPos < len (* groups ) && (* groups )[groupPos ].ID == groupID {
64
+ // 查找 Group 位置
65
+ groupPos := ag .getGroupNodePos (rule , groupID )
66
+
67
+ if groupPos < len (rule .Groups ) && (rule .Groups )[groupPos ].ID == groupID {
71
68
// 追加事件
72
- (* groups )[groupPos ].Events = append ((* groups )[groupPos ].Events , alert )
69
+ (rule . Groups )[groupPos ].Events = append ((rule . Groups )[groupPos ].Events , alert )
73
70
} else {
74
- // 插入新Group,新增空元素扩容切片
75
- * groups = append (* groups , EventsGroup {})
76
- // 将插入位置之后的元素后移,
77
- copy ((* groups )[groupPos + 1 :], (* groups )[groupPos :])
78
- // 将数据赋值到新的空元素上
79
- (* groups )[groupPos ] = EventsGroup {
71
+ // 插入新数据
72
+ rule .Groups = append (rule .Groups , EventsGroup {
80
73
ID : groupID ,
81
74
Events : []* models.AlertCurEvent {alert },
82
- }
75
+ })
83
76
}
84
77
return
85
78
}
86
79
87
80
// 插入新Rule
88
- ag .Rules = append (ag .Rules , RulesGroup {})
89
- copy (ag .Rules [rulePos + 1 :], ag .Rules [rulePos :])
90
- ag .Rules [rulePos ] = RulesGroup {
81
+ ag .Rules = append (ag .Rules , RulesGroup {
91
82
RuleID : alert .RuleId ,
92
83
Groups : []EventsGroup {
93
84
{
94
85
ID : groupID ,
95
86
Events : []* models.AlertCurEvent {alert },
96
87
},
97
88
},
98
- }
89
+ })
99
90
}
100
91
101
92
// generateGroupID 生成分组ID,每个规则可能会有多个分组(其分组通知),默认为 default,如果有匹配的分组则根据 key/value 计算一个 HASH值作为 ID。
@@ -114,6 +105,31 @@ func (ag *AlertGroups) generateGroupID(alert *models.AlertCurEvent, noticeGroupM
114
105
return groupId
115
106
}
116
107
108
+ // getRuleNodePos 获取 Rule 点位
109
+ func (ag * AlertGroups ) getRuleNodePos (ruleId string ) int {
110
+ // Rules 切片排序
111
+ sort .Slice (ag .Rules , func (i , j int ) bool {
112
+ return ag .Rules [i ].RuleID < ag .Rules [j ].RuleID
113
+ })
114
+
115
+ // 查找Rule位置
116
+ return sort .Search (len (ag .Rules ), func (i int ) bool {
117
+ return ag .Rules [i ].RuleID >= ruleId
118
+ })
119
+ }
120
+
121
+ func (ag * AlertGroups ) getGroupNodePos (rule * RulesGroup , groupId string ) int {
122
+ // Groups 切片排序
123
+ sort .Slice (rule .Groups , func (i , j int ) bool {
124
+ return rule .Groups [i ].ID < rule .Groups [j ].ID
125
+ })
126
+
127
+ // 查找Group位置
128
+ return sort .Search (len (rule .Groups ), func (i int ) bool {
129
+ return (rule .Groups )[i ].ID >= groupId
130
+ })
131
+ }
132
+
117
133
func NewConsumerWork (ctx * ctx.Context ) ConsumeInterface {
118
134
return & Consume {
119
135
ctx : ctx ,
@@ -141,52 +157,52 @@ func (c *Consume) Stop(faultCenterId string) {
141
157
142
158
// Watch 启动 Consumer Watch 进程
143
159
func (c * Consume ) Watch (ctx context.Context , faultCenter models.FaultCenter ) {
160
+ taskChan := make (chan struct {}, 1 )
144
161
timer := time .NewTicker (time .Second * time .Duration (1 ))
145
162
defer func () {
146
163
timer .Stop ()
147
- if r := recover (); r != nil {
148
- logc .Error (c .ctx .Ctx , fmt .Sprintf ("Recovered from consumer watch goroutine panic: %s, FaultCenterName: %s, Id: %s" , r , faultCenter .Name , faultCenter .ID ))
149
- }
164
+ // if r := recover(); r != nil {
165
+ // logc.Error(c.ctx.Ctx, fmt.Sprintf("Recovered from consumer watch goroutine panic: %s, FaultCenterName: %s, Id: %s", r, faultCenter.Name, faultCenter.ID))
166
+ // }
150
167
}()
151
168
152
169
for {
153
170
select {
154
171
case <- timer .C :
155
- c .processSilenceRule (faultCenter )
156
- // 获取故障中心的所有告警事件
157
- data , err := c .ctx .Redis .Redis ().HGetAll (faultCenter .GetFaultCenterKey ()).Result ()
158
- if err != nil {
159
- logc .Error (c .ctx .Ctx , fmt .Sprintf ("从 Redis 中获取事件信息错误, faultCenterKey: %s, err: %s" , faultCenter .GetFaultCenterKey (), err .Error ()))
160
- return
161
- }
162
- // 事件过滤
163
- filterEvents := c .filterAlertEvents (faultCenter , data )
164
- // 事件分组
165
- var alertGroups AlertGroups
166
- c .alarmGrouping (faultCenter , & alertGroups , filterEvents )
167
- // 事件聚合
168
- aggEvents := c .alarmAggregation (faultCenter , & alertGroups )
169
- // 发送事件
170
- c .sendAlerts (faultCenter , aggEvents )
172
+ // 处理任务信号量
173
+ taskChan <- struct {}{}
174
+ c .executeTask (faultCenter , taskChan )
171
175
case <- ctx .Done ():
172
176
return
173
177
}
174
178
}
175
179
}
176
180
177
- func (c * Consume ) wait (startAt , endAt int64 , sem chan struct {}) {
178
- timer := time .NewTicker (time .Second * time .Duration (1 ))
179
- for {
180
- select {
181
- case <- timer .C :
182
- if (endAt - startAt ) > 60 {
183
- time .Sleep (time .Millisecond * 200 )
184
- sem <- struct {}{}
185
- } else {
186
- return
187
- }
188
- }
181
+ // executeTask 执行具体的任务逻辑
182
+ func (c * Consume ) executeTask (faultCenter models.FaultCenter , taskChan chan struct {}) {
183
+ defer func () {
184
+ // 释放任务信号量
185
+ <- taskChan
186
+ }()
187
+ // 处理静默规则
188
+ c .processSilenceRule (faultCenter )
189
+ // 获取故障中心的所有告警事件
190
+ data , err := c .ctx .Redis .Redis ().HGetAll (faultCenter .GetFaultCenterKey ()).Result ()
191
+ if err != nil {
192
+ logc .Error (c .ctx .Ctx , fmt .Sprintf ("从 Redis 中获取事件信息错误, faultCenterKey: %s, err: %s" , faultCenter .GetFaultCenterKey (), err .Error ()))
193
+ return
189
194
}
195
+
196
+ // 事件过滤
197
+ filterEvents := c .filterAlertEvents (faultCenter , data )
198
+ // 事件分组
199
+ var alertGroups AlertGroups
200
+ c .alarmGrouping (faultCenter , & alertGroups , filterEvents )
201
+ fmt .Println ("alertGroups->" , tools .JsonMarshal (alertGroups .Rules ))
202
+ // 事件聚合
203
+ aggEvents := c .alarmAggregation (faultCenter , & alertGroups )
204
+ // 发送事件
205
+ c .sendAlerts (faultCenter , aggEvents )
190
206
}
191
207
192
208
// filterAlertEvents 过滤告警事件
@@ -226,23 +242,31 @@ func (c *Consume) isMutedEvent(event *models.AlertCurEvent, faultCenter models.F
226
242
227
243
// validateEvent 事件验证
228
244
func (c * Consume ) validateEvent (event * models.AlertCurEvent , faultCenter models.FaultCenter ) bool {
229
- if event .State == "Pending" {
230
- return false
231
- }
232
-
233
245
return event .IsRecovered || event .LastSendTime == 0 ||
234
246
event .LastEvalTime >= event .LastSendTime + faultCenter .RepeatNoticeInterval * 60
235
247
}
236
248
237
249
// alarmGrouping 告警分组
250
+ // 分组会进行 2 次分类
251
+ // 第一次是状态(用于区分事件是告警或恢复,用于后续聚合逻辑,避免告警和恢复聚合到一起)
252
+ // 第二次是规则(对隶属于相同规则的事件放在同一组,用于后续聚合逻辑,避免不同规则的告警或恢复聚合到一起)
238
253
func (c * Consume ) alarmGrouping (faultCenter models.FaultCenter , alertGroups * AlertGroups , alerts []* models.AlertCurEvent ) {
239
254
if len (alerts ) == 0 {
240
255
return
241
256
}
242
257
243
258
for _ , alert := range alerts {
259
+ // 状态分组
260
+ switch alert .IsRecovered {
261
+ case true :
262
+ alert .RuleId = "Recover_" + alert .RuleId
263
+ case false :
264
+ alert .RuleId = "Firing_" + alert .RuleId
265
+ default :
266
+ alert .RuleId = "Unknown_" + alert .RuleId
267
+ }
268
+
244
269
alertGroups .AddAlert (alert , faultCenter .NoticeGroup )
245
- //c.addAlertToGroup(alert, faultCenter.NoticeGroup)
246
270
if alert .IsRecovered {
247
271
c .removeAlertFromCache (alert )
248
272
if err := process .RecordAlertHisEvent (c .ctx , * alert ); err != nil {
0 commit comments