From af452488657186a748a42c22dd066c7d23ba9f46 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Fri, 19 Sep 2025 22:06:03 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat(Metrics&Alert):=20=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E8=A7=84=E5=88=99=E4=B8=8E=E9=98=88=E5=80=BC=E7=AE=A1=E7=90=86?= =?UTF-8?q?=E7=9A=84=E8=A1=A8=E7=BB=93=E6=9E=84=EF=BC=8C=E6=A0=B8=E5=BF=83?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3=EF=BC=8C=E7=BC=96=E6=8E=92=E6=B5=81=E7=A8=8B?= =?UTF-8?q?=EF=BC=8C=E4=B8=8E=E5=88=9D=E6=AD=A5=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/alerting/api.md | 2 +- docs/alerting/database-design.md | 117 ++++----- internal/alerting/service/ruleset/README.md | 235 ++++++++++++++++++ internal/alerting/service/ruleset/manager.go | 143 +++++++++++ .../alerting/service/ruleset/manager_test.go | 85 +++++++ .../alerting/service/ruleset/normalize.go | 57 +++++ .../service/ruleset/normalize_test.go | 23 ++ .../service/ruleset/promsync_exporter.go | 67 +++++ internal/alerting/service/ruleset/store_pg.go | 170 +++++++++++++ internal/alerting/service/ruleset/types.go | 86 +++++++ 10 files changed, 920 insertions(+), 65 deletions(-) create mode 100644 internal/alerting/service/ruleset/README.md create mode 100644 internal/alerting/service/ruleset/manager.go create mode 100644 internal/alerting/service/ruleset/manager_test.go create mode 100644 internal/alerting/service/ruleset/normalize.go create mode 100644 internal/alerting/service/ruleset/normalize_test.go create mode 100644 internal/alerting/service/ruleset/promsync_exporter.go create mode 100644 internal/alerting/service/ruleset/store_pg.go create mode 100644 internal/alerting/service/ruleset/types.go diff --git a/docs/alerting/api.md b/docs/alerting/api.md index df0ddce..80d4735 100644 --- a/docs/alerting/api.md +++ b/docs/alerting/api.md @@ -126,7 +126,7 @@ GET /v1/issues/{issueID} |--------|------|------| | id | string | 告警问题唯一标识 | | state | string | 告警工单的生命周期状态状态:`Open`、`Closed` | -| level | string | 告警等级:`P0`、`P1`、`P2`、`Warning` | +| level | string | 告警等级:`P0`、`P1`、`P2` | | alertState | string | 告警本身的实时状态:`Pending`、`Restored`、`AutoRestored`、`InProcessing` | | title | string | 告警标题描述 | | labels | Label[] | 标签数组 | diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md index 47508d1..98b3e1c 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -2,14 +2,13 @@ ## 概述 -本文档为最新数据库设计,总计包含 7 张表: +本文档为最新数据库设计,总计包含 6 张表: - alert_issues - alert_issue_comments -- metric_alert_changes +- alert_meta_change_logs - alert_rules -- service_alert_metas -- service_metrics +- alert_rule_metas - service_states ## 数据表设计 @@ -51,16 +50,22 @@ --- -### 3) metric_alert_changes(指标告警规则变更记录表) +### 3) alert_meta_change_logs(阈值变更记录表) -用于追踪指标类告警规则或参数的变更历史。 +用于追踪规则阈值(threshold)与观察窗口(watch_time)的变更历史。 | 字段名 | 类型 | 说明 | |--------|------|------| -| id | varchar(64) PK | 变更记录 ID | -| change_time | TIMESTAMP(6) | 变更时间 | -| alert_name | varchar(255) | 告警名称/规则名 | -| change_items | json | 变更项数组:[{key, old_value, new_value}] | +| id | varchar(64) PK | 幂等/去重标识 | +| change_type | varchar(16) | 变更类型:Create / Update / Delete / Rollback | +| change_time | timestamptz | 变更时间 | +| alert_name | varchar(255) | 规则名 | +| labels | text | labels 的 JSON 字符串表示(规范化后) | +| old_threshold | numeric | 旧阈值(可空) | +| new_threshold | numeric | 新阈值(可空) | +| old_watch | interval | 旧观察窗口(可空) | +| new_watch | interval | 新观察窗口(可空) | + **索引建议:** - PRIMARY KEY: `id` @@ -71,49 +76,37 @@ ### 4) alert_rules(告警规则表) 
-定义可复用的规则表达式,支持作用域绑定。 - -| 字段名 | 类型 | 说明 | +| 字段名 | 类型 | 说明 | |--------|------|------| -| id | varchar(255) PK | 规则 ID(可与 K8s 资源 ID 对应或做映射) | -| name | varchar(255) | 规则名称,表达式可读的名称 | -| scopes | varchar(255) | 作用域,例:"services:svc1,svc2" | -| expr | text | 规则表达式(可含占位符) | - -**索引建议:** -- PRIMARY KEY: `id` -- INDEX: `(name)` -- INDEX: `(scopes)` +|name|varchar(255)|主键,告警规则名称| +|description|text|可读标题,可拼接渲染为可读的 title| +|expr|text|左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version))| +|op|varchar(4)|阈值比较方式(枚举:>, <, =, !=)| +|severity|varchar(32)|告警等级,通常进入告警的 labels.severity| ---- - -### 5) service_alert_metas(服务告警元数据表) +**约束建议:** +- CHECK 约束:`op IN ('>', '<', '=', '!=')` -按服务维度存放参数化配置,用于渲染具体规则。 +⸻ -| 字段名 | 类型 | 说明 | -|--------|------|------| -| service | varchar(255) | 服务名 | -| key | varchar(255) | 参数名(如 `apitime_threshold`) | -| value | varchar(255) | 参数值(如 `50`) | +### 5) alert_rule_metas(规则阈值元信息表) -**索引建议:** -- PRIMARY KEY: `(service, key)` -- INDEX: `(service)` +字段名 类型 说明 +alert_name varchar(255) 关联 `alert_rules.name` +labels jsonb 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 +threshold numeric 阈值(会被渲染成特定规则的 threshold metric 数值) +watch_time interval 持续时长(映射 Prometheus rule 的 for:) ---- +**约束与索引建议:** +- FOREIGN KEY: `(alert_name)` REFERENCES `alert_rules(name)` ON DELETE CASCADE +- UNIQUE: `(alert_name, labels)` +- GIN INDEX: `labels`(`CREATE INDEX idx_metas_labels_gin ON alert_rule_metas USING gin(labels);`) -### 6) service_metrics(服务指标清单表) +⸻ -记录服务所关注的指标清单(可用于 UI 侧展示或校验)。 - -| 字段名 | 类型 | 说明 | -|--------|------|------| -| service | varchar(255) PK | 服务名 | -| metrics | json | 指标名数组:["metric1", "metric2", ...] | - -**索引建议:** -- PRIMARY KEY: `service` +说明: +- labels 建议用 jsonb,方便在 Postgres 中做索引和查询。 +- labels 的键名与值格式应在应用层规范化(排序/小写/去空值)以确保唯一性和可查询性一致。 --- @@ -140,21 +133,18 @@ erDiagram alert_issues ||--o{ alert_issue_comments : "has comments" alert_rules { - varchar id PK - varchar name - varchar scopes + varchar name PK + text description text expr + varchar op + varchar severity } - service_alert_metas { - varchar service PK - varchar key PK - varchar value - } - - service_metrics { - varchar service PK - json metrics + alert_rule_metas { + varchar alert_name FK + jsonb labels + numeric threshold + interval watch_time } service_states { @@ -184,14 +174,13 @@ erDiagram text content } - %% 通过 service 逻辑关联 - service_alert_metas ||..|| service_metrics : "by service" - service_states ||..|| service_alert_metas : "by service" + %% 通过 service 等标签在应用层逻辑关联 + alert_rule_metas ||..|| alert_rules : "by alert_name" + service_states ||..|| alert_rule_metas : "by service/version labels" ``` ## 数据流转 -1. 以 `alert_rules` 为模版,结合 `service_alert_metas` 渲染出面向具体服务的规则。 -2. 指标或规则参数发生调整时,记录到 `metric_alert_changes`。 -3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 -4. 面向服务的整体健康态以 `service_states` 记录和推进(new → analyzing → processing → resolved)。 \ No newline at end of file +1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。 +2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。 +3. 
规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file diff --git a/internal/alerting/service/ruleset/README.md b/internal/alerting/service/ruleset/README.md new file mode 100644 index 0000000..9c9ff47 --- /dev/null +++ b/internal/alerting/service/ruleset/README.md @@ -0,0 +1,235 @@ +## Ruleset(规则与阈值管理) + +本目录为“规则与阈值管理(ruleset)”实现说明。内容聚焦于:表结构、核心接口、编排流程、Prometheus 同步方式、并发与一致性、测试与使用示例。文档与当前代码实现保持一致。 + +--- + +## 1) 目标与边界(已实现) + +- 通过 `alert_rules` 与 `alert_rule_metas`,为同一告警规则按标签维度(如 `service`、`version`)配置阈值与持续时间(`watch_time`)。 +- 变更阈值后,立刻同步到内存 Exporter(无需 Prometheus reload)。 +- 多告警等级(P0/P1…)通过“多条规则”实现(如 `latency_p95_P0` 与 `latency_p95_P1`)。 +- 记录变更日志,支持审计,便于后续扩展回滚能力。 + +--- + +## 2) Go 组件与接口 + +### 2.1 关键类型与接口(节选) + +```go +// types.go +type AlertRule struct { + Name string + Description string + Expr string + Op string + Severity string +} + +type LabelMap map[string]string + +type AlertRuleMeta struct { + AlertName string + Labels LabelMap + Threshold float64 + WatchTime time.Duration // interval 映射 +} + +type ChangeLog struct { + ID string + AlertName string + ChangeType string + Labels LabelMap + OldThreshold *float64 + NewThreshold *float64 + OldWatch *time.Duration + NewWatch *time.Duration + ChangeTime time.Time +} + +type Store interface { + // rules + CreateRule(ctx context.Context, r *AlertRule) error + GetRule(ctx context.Context, name string) (*AlertRule, error) + UpdateRule(ctx context.Context, r *AlertRule) error + DeleteRule(ctx context.Context, name string) error + + // metas (UPSERT by alert_name + labels) + UpsertMeta(ctx context.Context, m *AlertRuleMeta) (created bool, err error) + GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) + DeleteMeta(ctx context.Context, name string, labels LabelMap) error + + // change logs + InsertChangeLog(ctx context.Context, log *ChangeLog) error + + // tx helpers + WithTx(ctx context.Context, fn func(Store) error) error +} + +type PromSync interface { + AddToPrometheus(ctx context.Context, r *AlertRule) error // 新增时更新 rule 文件并 reload(当前实现为占位) + DeleteFromPrometheus(ctx context.Context, name string) error // 删除(当前实现为占位) + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error +} + +type AlertRuleMgr interface { + LoadRule(ctx context.Context) error + UpsertRuleMetas(ctx context.Context, m *AlertRuleMeta) error + AddAlertRule(ctx context.Context, r *AlertRule) error + DeleteAlertRule(ctx context.Context, name string) error + + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error + RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error +} +``` + +### 2.2 Manager 核心流程(与实现一致) + +```go +func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) error { + meta.Labels = NormalizeLabels(meta.Labels, m.aliasMap) + if err := validateMeta(meta); err != nil { return err } + return m.store.WithTx(ctx, func(tx Store) error { + oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { return err } + var old *AlertRuleMeta + if len(oldList) > 0 { old = oldList[0] } + if _, err := tx.UpsertMeta(ctx, meta); err != nil { return err } + if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { return err } + return m.prom.SyncMetaToPrometheus(ctx, meta) + }) +} +``` + +--- + +## 3) Prometheus 同步 + +- 实现为内存版 Exporter(`ExporterSync`),维护 `(rule + 规范化 labels) → {threshold, watch_time}`。 +- 
`SyncMetaToPrometheus` 直接更新内存映射,变更即时生效。 +- `AddToPrometheus`/`DeleteFromPrometheus` 作为占位,当前不写规则文件。 +- 如需以 metrics 暴露阈值,可在同进程 `/metrics` 将 `ExporterSync` 中的映射导出(按规则维度命名指标)。 + +--- + +## 4) 事务、并发与一致性 + +- `Store.WithTx`:当前 PgStore 直接调用 fn(占位),可按需扩展为真正事务。 +- 写入采用单条 UPSERT(见下文 SQL),满足幂等。 +- 如存在同一 `(alert_name, labels)` 的高并发写入,建议使用 Postgres advisory lock。 +- Exporter 同步在 Upsert 成功后执行,生产中建议串行化该步骤以避免竞态。 + +提示: +- 标签命名不一致(例如 `service_version`/`version` 混用)通过 `NormalizeLabels` 的别名映射解决。 +- 多层阈值优先级(`{}`, `{service}`, `{service,version}`)建议仅导出“最具体”的一条(当前实现未裁剪,可扩展)。 + +--- + +## 5) SQL 示例(与代码一致) + +### 5.1 UPSERT Meta(带审计在应用层做) + +```sql +-- 假设参数:$1 alert_name, $2 labels::jsonb, $3 threshold::numeric, $4 watch::interval +INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) +VALUES ($1, $2, $3, $4) +ON CONFLICT (alert_name, labels) DO UPDATE SET + threshold = EXCLUDED.threshold, + watch_time = EXCLUDED.watch_time, + updated_at = now(); +``` + +### 5.2 查询:按部分标签匹配 + +```sql +-- 传入 {"service":"stg"},返回该规则下 service=stg 的 metas(无视 version) +SELECT * FROM alert_rule_metas +WHERE alert_name = $1 + AND labels @> $2::jsonb; -- 包含关系 +``` + +--- + +## 6) 使用示例(最小化) + +```go +db, _ := database.New(os.Getenv("ALERTING_PG_DSN")) +store := ruleset.NewPgStore(db) +prom := ruleset.NewExporterSync() +mgr := ruleset.NewManager(store, prom, map[string]string{"service_version":"version"}) + +meta := &ruleset.AlertRuleMeta{ + AlertName: "latency_p95_P0", + Labels: ruleset.LabelMap{"Service": "s3", "service_version": "v1"}, + Threshold: 450, + WatchTime: 2 * time.Minute, +} +_ = mgr.UpsertRuleMetas(context.Background(), meta) +``` + +--- + +## 7) Exporter 要点(当前实现) + +- 使用 `CanonicalLabelKey` 生成稳定键。 +- 当前未实现“优先级裁剪”(`{}`, `{service}`, `{service,version}` 仅导出最具体),可按需扩展。 +- 多副本部署需共享或拉取状态(可由 DB 拉取或事件广播)。 + +--- + +## 8) 测试 + +### 8.1 单元测试 + +- NormalizeLabels 与 CanonicalLabelKey: + - 输入包含大小写、空白、别名键(如 `service_version`)的 labels,断言小写化、去空白、别名映射、移除空值; + - 对乱序键,`CanonicalLabelKey` 结果一致。 +- Manager.UpsertRuleMetas: + - 使用内存实现的 Store 与 ExporterSync: + - 首次 Upsert 走 Create 分支,写入 metas,并同步到 ExporterSync; + - 再次 Upsert 走 Update 分支,产生变更日志; + - 断言阈值已生效到 ExporterSync。 + +对应测试用例:`internal/alerting/service/ruleset/normalize_test.go`,`internal/alerting/service/ruleset/manager_test.go` + +运行: + +```bash +go test ./internal/alerting/service/ruleset -v +``` + +### 8.2 手动测试(本地) + +- 数据库准备: + - 按本文数据库设计创建表(或参考 `docs/alerting/database-design.md`)。 + - 在 `alert_rules` 插入一条规则,如:`latency_p95_P0`。 +- 启动 Exporter/服务: + - 代码中使用 `NewExporterSync()` 并注入到 `NewManager(...)`。 + - 通过 `Manager.UpsertRuleMetas` 传入 `AlertRuleMeta{AlertName:"latency_p95_P0", Labels:{service:"s3",version:"v1"}, Threshold:450, WatchTime:2m}`。 + - 验证内存 Exporter `ForTestingGet` 返回阈值为 450。 +- 变更验证: + - 再次调用 `UpsertRuleMetas`,阈值改为 500,检查 `alert_meta_change_logs` 新增 Update 记录。 +- 回滚演练: + - 读取上一条变更日志的 old 值,再次 Upsert 即可实现回滚(可后续补充接口)。 + +--- + +## 9) 需求映射 + +- 同一规则多阈值等级(P0/P1)→ 通过多条 `alert_rules`(如 `_P0` 与 `_P1`)。 +- 告警变更接口(service + meta 参数)→ 统一落在 `alert_rule_metas`(已支持 labels 任意组合)。 +- 变更记录查询 → `alert_meta_change_logs`。 + +--- + +## 10) 后续增强(建议) + +1. Exporter 端优先级裁剪(仅导出最具体标签的阈值)。 +2. PgStore 接入真实事务(BeginTx),必要时使用 advisory lock。 +3. 增加回滚接口:基于 change_log 的 old 值再 Upsert 一次。 +4. 
阈值 metrics 暴露:统一命名(每条规则单独 threshold metric)。 + +--- \ No newline at end of file diff --git a/internal/alerting/service/ruleset/manager.go b/internal/alerting/service/ruleset/manager.go new file mode 100644 index 0000000..03e58d2 --- /dev/null +++ b/internal/alerting/service/ruleset/manager.go @@ -0,0 +1,143 @@ +package ruleset + +import ( + "context" + "errors" + "fmt" + "time" +) + +var ( + // ErrInvalidMeta indicates provided meta is incomplete or invalid. + ErrInvalidMeta = errors.New("invalid alert rule meta") +) + +// Manager implements AlertRuleMgr, coordinating store and Prometheus sync. +type Manager struct { + store Store + prom PromSync + aliasMap map[string]string +} + +func NewManager(store Store, prom PromSync, aliasMap map[string]string) *Manager { + if aliasMap == nil { + aliasMap = map[string]string{} + } + return &Manager{store: store, prom: prom, aliasMap: aliasMap} +} + +func (m *Manager) LoadRule(ctx context.Context) error { return nil } + +func (m *Manager) AddAlertRule(ctx context.Context, r *AlertRule) error { + if r == nil || r.Name == "" { + return fmt.Errorf("invalid rule") + } + if err := m.store.CreateRule(ctx, r); err != nil { + return err + } + return m.prom.AddToPrometheus(ctx, r) +} + +func (m *Manager) DeleteAlertRule(ctx context.Context, name string) error { + if name == "" { + return fmt.Errorf("invalid name") + } + if err := m.store.DeleteRule(ctx, name); err != nil { + return err + } + return m.prom.DeleteFromPrometheus(ctx, name) +} + +func (m *Manager) AddToPrometheus(ctx context.Context, r *AlertRule) error { + return m.prom.AddToPrometheus(ctx, r) +} +func (m *Manager) DeleteFromPrometheus(ctx context.Context, name string) error { + return m.prom.DeleteFromPrometheus(ctx, name) +} +func (m *Manager) SyncMetaToPrometheus(ctx context.Context, meta *AlertRuleMeta) error { + return m.prom.SyncMetaToPrometheus(ctx, meta) +} + +func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) error { + if meta == nil { + return ErrInvalidMeta + } + meta.Labels = NormalizeLabels(meta.Labels, m.aliasMap) + if err := validateMeta(meta); err != nil { + return err + } + return m.store.WithTx(ctx, func(tx Store) error { + oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { + return err + } + var old *AlertRuleMeta + if len(oldList) > 0 { + old = oldList[0] + } + _, err = tx.UpsertMeta(ctx, meta) + if err != nil { + return err + } + if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { + return err + } + if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { + return err + } + return nil + }) +} + +func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { + if newMeta == nil { + return nil + } + var oldTh, newTh *float64 + var oldW, newW *time.Duration + if oldMeta != nil { + oldTh = &oldMeta.Threshold + oldW = &oldMeta.WatchTime + } + if newMeta != nil { + newTh = &newMeta.Threshold + newW = &newMeta.WatchTime + } + log := &ChangeLog{ + ID: fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), time.Now().UnixNano()), + AlertName: newMeta.AlertName, + ChangeType: classifyChange(oldMeta, newMeta), + Labels: newMeta.Labels, + OldThreshold: oldTh, + NewThreshold: newTh, + OldWatch: oldW, + NewWatch: newW, + ChangeTime: time.Now().UTC(), + } + return m.store.InsertChangeLog(ctx, log) +} + +func classifyChange(oldMeta, newMeta *AlertRuleMeta) string { + if oldMeta == nil && newMeta != nil { + return "Create" + } + if oldMeta != nil && newMeta == nil 
{ + return "Delete" + } + return "Update" +} + +func validateMeta(m *AlertRuleMeta) error { + if m.AlertName == "" { + return ErrInvalidMeta + } + if !isFinite(m.Threshold) { + return ErrInvalidMeta + } + if m.WatchTime < 0 { + return ErrInvalidMeta + } + return nil +} + +func isFinite(f float64) bool { return !((f != f) || (f > 1e308) || (f < -1e308)) } diff --git a/internal/alerting/service/ruleset/manager_test.go b/internal/alerting/service/ruleset/manager_test.go new file mode 100644 index 0000000..658df7b --- /dev/null +++ b/internal/alerting/service/ruleset/manager_test.go @@ -0,0 +1,85 @@ +package ruleset + +import ( + "context" + "testing" + "time" +) + +type memStore struct { + rules map[string]*AlertRule + metas map[string]*AlertRuleMeta + logs []*ChangeLog +} + +func newMemStore() *memStore { + return &memStore{rules: map[string]*AlertRule{}, metas: map[string]*AlertRuleMeta{}, logs: []*ChangeLog{}} +} + +func (m *memStore) CreateRule(ctx context.Context, r *AlertRule) error { + m.rules[r.Name] = r + return nil +} +func (m *memStore) GetRule(ctx context.Context, name string) (*AlertRule, error) { + return m.rules[name], nil +} +func (m *memStore) UpdateRule(ctx context.Context, r *AlertRule) error { + m.rules[r.Name] = r + return nil +} +func (m *memStore) DeleteRule(ctx context.Context, name string) error { + delete(m.rules, name) + return nil +} +func (m *memStore) UpsertMeta(ctx context.Context, meta *AlertRuleMeta) (bool, error) { + m.metas[meta.AlertName+"|"+CanonicalLabelKey(meta.Labels)] = meta + return true, nil +} +func (m *memStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { + if v, ok := m.metas[name+"|"+CanonicalLabelKey(labels)]; ok { + return []*AlertRuleMeta{v}, nil + } + return nil, nil +} +func (m *memStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { + delete(m.metas, name+"|"+CanonicalLabelKey(labels)) + return nil +} +func (m *memStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { + m.logs = append(m.logs, log) + return nil +} +func (m *memStore) WithTx(ctx context.Context, fn func(Store) error) error { return fn(m) } + +func TestManager_UpsertRuleMetas(t *testing.T) { + ctx := context.Background() + store := newMemStore() + prom := NewExporterSync() + mgr := NewManager(store, prom, map[string]string{"service_version": "version"}) + + meta := &AlertRuleMeta{AlertName: "latency_p95_P0", Labels: LabelMap{"Service": "s3", "service_version": "v1"}, Threshold: 450, WatchTime: 2 * time.Minute} + if err := mgr.UpsertRuleMetas(ctx, meta); err != nil { + t.Fatalf("upsert meta: %v", err) + } + // verify normalization + if _, ok := store.metas["latency_p95_P0|service=s3|version=v1"]; !ok { + t.Fatalf("normalized meta not found in store: %#v", store.metas) + } + // verify prom sync + if th, _, ok := prom.ForTestingGet("latency_p95_P0", LabelMap{"service": "s3", "version": "v1"}); !ok || th != 450 { + t.Fatalf("prom sync threshold mismatch: th=%v ok=%v", th, ok) + } + // verify change log + if len(store.logs) != 1 || store.logs[0].ChangeType != "Create" { + t.Fatalf("unexpected change logs: %#v", store.logs) + } + + // update path + meta2 := &AlertRuleMeta{AlertName: "latency_p95_P0", Labels: LabelMap{"service": "s3", "version": "v1"}, Threshold: 500, WatchTime: 3 * time.Minute} + if err := mgr.UpsertRuleMetas(ctx, meta2); err != nil { + t.Fatalf("upsert meta2: %v", err) + } + if len(store.logs) != 2 || store.logs[1].ChangeType != "Update" { + t.Fatalf("expected update log, got: %#v", 
store.logs) + } +} diff --git a/internal/alerting/service/ruleset/normalize.go b/internal/alerting/service/ruleset/normalize.go new file mode 100644 index 0000000..74b7b8c --- /dev/null +++ b/internal/alerting/service/ruleset/normalize.go @@ -0,0 +1,57 @@ +package ruleset + +import ( + "sort" + "strings" +) + +// NormalizeLabels returns a new LabelMap with keys normalized to lowercase, trimmed, aliases applied, +// empty values removed, and values trimmed. It does not mutate the input map. +// aliasMap maps alternative keys to canonical keys, e.g., "service_version" -> "version". +func NormalizeLabels(in LabelMap, aliasMap map[string]string) LabelMap { + if len(in) == 0 { + return LabelMap{} + } + result := make(LabelMap, len(in)) + for rawKey, rawVal := range in { + key := strings.ToLower(strings.TrimSpace(rawKey)) + if key == "" { + continue + } + if canonical, ok := aliasMap[key]; ok && strings.TrimSpace(canonical) != "" { + key = strings.ToLower(strings.TrimSpace(canonical)) + } + val := strings.TrimSpace(rawVal) + if val == "" { + continue + } + result[key] = val + } + return result +} + +// CanonicalLabelKey returns a stable string representation of labels for use as a map key. +// It sorts keys and concatenates as key=value pairs separated by '|'. +// This ensures {a=1,b=2} and {b=2,a=1} produce identical keys. +func CanonicalLabelKey(labels LabelMap) string { + if len(labels) == 0 { + return "{}" + } + keys := make([]string, 0, len(labels)) + for k := range labels { + keys = append(keys, k) + } + sort.Strings(keys) + var b strings.Builder + b.Grow(len(keys) * 8) + for i := 0; i < len(keys); i++ { + if i > 0 { + b.WriteByte('|') + } + k := keys[i] + b.WriteString(k) + b.WriteByte('=') + b.WriteString(labels[k]) + } + return b.String() +} diff --git a/internal/alerting/service/ruleset/normalize_test.go b/internal/alerting/service/ruleset/normalize_test.go new file mode 100644 index 0000000..02a9c75 --- /dev/null +++ b/internal/alerting/service/ruleset/normalize_test.go @@ -0,0 +1,23 @@ +package ruleset + +import "testing" + +func TestNormalizeLabels(t *testing.T) { + alias := map[string]string{"service_version": "version"} + in := LabelMap{" Service ": " s3 ", "service_version": " V1 ", "empty": " "} + out := NormalizeLabels(in, alias) + if out["service"] != "s3" || out["version"] != "V1" { + t.Fatalf("unexpected normalize: %#v", out) + } + if _, ok := out["empty"]; ok { + t.Fatalf("empty value should be removed: %#v", out) + } +} + +func TestCanonicalLabelKey(t *testing.T) { + key1 := CanonicalLabelKey(LabelMap{"b": "2", "a": "1"}) + key2 := CanonicalLabelKey(LabelMap{"a": "1", "b": "2"}) + if key1 != key2 { + t.Fatalf("keys should be equal: %s vs %s", key1, key2) + } +} diff --git a/internal/alerting/service/ruleset/promsync_exporter.go b/internal/alerting/service/ruleset/promsync_exporter.go new file mode 100644 index 0000000..573365f --- /dev/null +++ b/internal/alerting/service/ruleset/promsync_exporter.go @@ -0,0 +1,67 @@ +package ruleset + +import ( + "context" + "fmt" + "sync" + "time" +) + +// ExporterSync is an in-memory PromSync implementation that maintains threshold/watch values +// for each (rule, labels) pair. It is intended for unit tests and simple deployments where +// another component exposes these as metrics. 
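// One way another component could expose the map as metrics — a sketch
// assuming prometheus/client_golang, which this package does not import:
//
//	gauge := prometheus.NewGaugeVec(
//		prometheus.GaugeOpts{Name: "alert_rule_threshold"},
//		[]string{"rule", "labels"},
//	)
//	for key, th := range thresholds { // a snapshot taken under the read lock
//		rule, lbls, _ := strings.Cut(key, "|") // key layout: "<rule>|<canonical labels>"
//		gauge.WithLabelValues(rule, lbls).Set(th)
//	}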
+type ExporterSync struct { + mu sync.RWMutex + thresholds map[string]float64 + watchTimes map[string]time.Duration +} + +func NewExporterSync() *ExporterSync { + return &ExporterSync{ + thresholds: make(map[string]float64), + watchTimes: make(map[string]time.Duration), + } +} + +// keyFor builds a stable key for the given rule and labels. +func (e *ExporterSync) keyFor(rule string, labels LabelMap) string { + return fmt.Sprintf("%s|%s", rule, CanonicalLabelKey(labels)) +} + +func (e *ExporterSync) AddToPrometheus(ctx context.Context, r *AlertRule) error { return nil } + +func (e *ExporterSync) DeleteFromPrometheus(ctx context.Context, name string) error { + e.mu.Lock() + defer e.mu.Unlock() + // delete all entries for the rule + prefix := name + "|" + for k := range e.thresholds { + if len(k) >= len(prefix) && k[:len(prefix)] == prefix { + delete(e.thresholds, k) + delete(e.watchTimes, k) + } + } + return nil +} + +func (e *ExporterSync) SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error { + if m == nil || m.AlertName == "" { + return fmt.Errorf("invalid meta: missing alert name") + } + e.mu.Lock() + defer e.mu.Unlock() + key := e.keyFor(m.AlertName, m.Labels) + e.thresholds[key] = m.Threshold + e.watchTimes[key] = m.WatchTime + return nil +} + +// ForTestingGet exposes current values for assertions in unit tests. +func (e *ExporterSync) ForTestingGet(rule string, labels LabelMap) (threshold float64, watch time.Duration, ok bool) { + e.mu.RLock() + defer e.mu.RUnlock() + key := e.keyFor(rule, labels) + v, ok1 := e.thresholds[key] + w, ok2 := e.watchTimes[key] + return v, w, ok1 && ok2 +} diff --git a/internal/alerting/service/ruleset/store_pg.go b/internal/alerting/service/ruleset/store_pg.go new file mode 100644 index 0000000..fe3d231 --- /dev/null +++ b/internal/alerting/service/ruleset/store_pg.go @@ -0,0 +1,170 @@ +package ruleset + +import ( + "context" + "encoding/json" + "fmt" + "time" + + abd "github.com/qiniu/zeroops/internal/alerting/database" +) + +// PgStore is a PostgreSQL-backed Store implementation using the alerting database wrapper. +// Note: The current database wrapper does not expose transactions; WithTx acts as a simple wrapper. +// For production-grade atomicity, extend the database wrapper to support sql.Tx and wire it here. +type PgStore struct { + DB *abd.Database +} + +func NewPgStore(db *abd.Database) *PgStore { return &PgStore{DB: db} } + +func (s *PgStore) WithTx(ctx context.Context, fn func(Store) error) error { + // Fallback: invoke fn directly. Replace with real transactional context when available. 
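// A possible transactional shape once the wrapper grows a BeginTx method
// (assumed API; newTxStore would be a hypothetical Store bound to the tx):
//
//	tx, err := s.DB.BeginTx(ctx, nil)
//	if err != nil {
//		return err
//	}
//	defer func() { _ = tx.Rollback() }() // returns sql.ErrTxDone after Commit; safe to ignore
//	if err := fn(newTxStore(tx)); err != nil {
//		return err
//	}
//	return tx.Commit()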
+ return fn(s) +} + +func (s *PgStore) CreateRule(ctx context.Context, r *AlertRule) error { + const q = ` + INSERT INTO alert_rules(name, description, expr, op, severity) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (name) DO UPDATE SET + description = EXCLUDED.description, + expr = EXCLUDED.expr, + op = EXCLUDED.op, + severity = EXCLUDED.severity + ` + _, err := s.DB.ExecContext(ctx, q, r.Name, r.Description, r.Expr, r.Op, r.Severity) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + return nil +} + +func (s *PgStore) GetRule(ctx context.Context, name string) (*AlertRule, error) { + const q = `SELECT name, description, expr, op, severity FROM alert_rules WHERE name = $1` + rows, err := s.DB.QueryContext(ctx, q, name) + if err != nil { + return nil, fmt.Errorf("get rule: %w", err) + } + defer rows.Close() + if rows.Next() { + var r AlertRule + if err := rows.Scan(&r.Name, &r.Description, &r.Expr, &r.Op, &r.Severity); err != nil { + return nil, fmt.Errorf("scan rule: %w", err) + } + return &r, nil + } + return nil, fmt.Errorf("rule not found: %s", name) +} + +func (s *PgStore) UpdateRule(ctx context.Context, r *AlertRule) error { + const q = `UPDATE alert_rules SET description=$2, expr=$3, op=$4, severity=$5 WHERE name=$1` + _, err := s.DB.ExecContext(ctx, q, r.Name, r.Description, r.Expr, r.Op, r.Severity) + if err != nil { + return fmt.Errorf("update rule: %w", err) + } + return nil +} + +func (s *PgStore) DeleteRule(ctx context.Context, name string) error { + const q = `DELETE FROM alert_rules WHERE name=$1` + _, err := s.DB.ExecContext(ctx, q, name) + if err != nil { + return fmt.Errorf("delete rule: %w", err) + } + return nil +} + +func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error) { + labelsJSON, _ := json.Marshal(m.Labels) + const q = ` + INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) + VALUES ($1, $2::jsonb, $3, $4) + ON CONFLICT (alert_name, labels) DO UPDATE SET + threshold=EXCLUDED.threshold, + watch_time=EXCLUDED.watch_time, + updated_at=now() + ` + _, err := s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, m.WatchTime) + if err != nil { + return false, fmt.Errorf("upsert meta: %w", err) + } + // created flag is not easily observable here without RETURNING clause; return false. 
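// A sketch of how the flag could be surfaced (assumes switching this call to
// a row-returning query): append `RETURNING (xmax = 0) AS created` to the
// statement and scan the result — in PostgreSQL, xmax is 0 for a freshly
// inserted row, so `created` is true on the insert path and false on the
// conflict/update path.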
+ return false, nil +} + +func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { + labelsJSON, _ := json.Marshal(labels) + const q = ` + SELECT alert_name, labels, threshold, watch_time + FROM alert_rule_metas + WHERE alert_name = $1 AND labels = $2::jsonb + ` + rows, err := s.DB.QueryContext(ctx, q, name, string(labelsJSON)) + if err != nil { + return nil, fmt.Errorf("get metas: %w", err) + } + defer rows.Close() + var res []*AlertRuleMeta + for rows.Next() { + var alertName string + var labelsRaw string + var threshold float64 + var watch any + if err := rows.Scan(&alertName, &labelsRaw, &threshold, &watch); err != nil { + return nil, fmt.Errorf("scan meta: %w", err) + } + lm := LabelMap{} + _ = json.Unmarshal([]byte(labelsRaw), &lm) + meta := &AlertRuleMeta{AlertName: alertName, Labels: lm, Threshold: threshold} + // best-effort: watch_time may come back as string or duration; we try string -> duration + switch v := watch.(type) { + case string: + if d, err := timeParseDurationPG(v); err == nil { + meta.WatchTime = d + } + } + res = append(res, meta) + } + return res, nil +} + +func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { + labelsJSON, _ := json.Marshal(labels) + const q = `DELETE FROM alert_rule_metas WHERE alert_name=$1 AND labels=$2::jsonb` + _, err := s.DB.ExecContext(ctx, q, name, string(labelsJSON)) + if err != nil { + return fmt.Errorf("delete meta: %w", err) + } + return nil +} + +func (s *PgStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { + labelsJSON, _ := json.Marshal(log.Labels) + const q = ` + INSERT INTO alert_meta_change_logs(id, alert_name, change_type, labels, old_threshold, new_threshold, old_watch, new_watch, change_time) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + ` + _, err := s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, log.OldWatch, log.NewWatch, log.ChangeTime) + if err != nil { + return fmt.Errorf("insert change log: %w", err) + } + return nil +} + +// timeParseDurationPG parses a small subset of PostgreSQL interval text output into time.Duration. +// Supported examples: "01:02:03", "02:03", "3600 seconds". Best-effort only. +func timeParseDurationPG(s string) (time.Duration, error) { + // HH:MM:SS + var h, m int + var sec float64 + if n, _ := fmt.Sscanf(s, "%d:%d:%f", &h, &m, &sec); n >= 2 { + d := time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(sec*float64(time.Second)) + return d, nil + } + var seconds float64 + if n, _ := fmt.Sscanf(s, "%f seconds", &seconds); n == 1 { + return time.Duration(seconds * float64(time.Second)), nil + } + return 0, fmt.Errorf("unsupported interval format: %s", s) +} diff --git a/internal/alerting/service/ruleset/types.go b/internal/alerting/service/ruleset/types.go new file mode 100644 index 0000000..4009832 --- /dev/null +++ b/internal/alerting/service/ruleset/types.go @@ -0,0 +1,86 @@ +package ruleset + +import ( + "context" + "time" +) + +// AlertRule defines a logical alert rule. This corresponds to a Prometheus alert rule entry +// excluding threshold information, which is managed separately via AlertRuleMeta. +// Name is the business identifier and should align with Prometheus alert: field. +type AlertRule struct { + Name string // unique rule name, typically equals Prometheus alert name + Description string // human readable explanation + Expr string // left-hand PromQL expression (e.g. 
p95 latency expression) + Op string // comparison operator: one of >, <, =, != + Severity string // severity code such as P0, P1, P2 +} + +// LabelMap represents a normalized set of label key-value pairs that identify a meta scope. +// Standardization rules are applied before persistence (see normalize.go). +type LabelMap map[string]string + +// AlertRuleMeta holds threshold and watch duration for a specific rule under certain labels. +// Threshold is a numeric boundary; WatchTime maps to Prometheus rule "for:" duration. +type AlertRuleMeta struct { + AlertName string // foreign key to AlertRule.Name + Labels LabelMap // normalized labels; {} means global default + Threshold float64 // numeric threshold + WatchTime time.Duration // watch window; exported or translated to Prometheus for: +} + +// ChangeLog captures before/after changes for auditing and potential rollback. +type ChangeLog struct { + ID string // external id for de-duplication + AlertName string // rule name + ChangeType string // Create | Update | Delete | Rollback + Labels LabelMap // affected labels + OldThreshold *float64 // nil if not applicable + NewThreshold *float64 // nil if not applicable + OldWatch *time.Duration // nil if not applicable + NewWatch *time.Duration // nil if not applicable + ChangeTime time.Time // when the change happened +} + +// Store abstracts persistence operations for rules and metas. Implementations should ensure +// correctness under concurrency via UPSERTs and, if necessary, advisory locks. +type Store interface { + // Rule CRUD + CreateRule(ctx context.Context, r *AlertRule) error + GetRule(ctx context.Context, name string) (*AlertRule, error) + UpdateRule(ctx context.Context, r *AlertRule) error + DeleteRule(ctx context.Context, name string) error + + // Meta operations (UPSERT by alert_name + labels) + UpsertMeta(ctx context.Context, m *AlertRuleMeta) (created bool, err error) + GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) + DeleteMeta(ctx context.Context, name string, labels LabelMap) error + + // Change logs + InsertChangeLog(ctx context.Context, log *ChangeLog) error + + // Transaction helper. Implementation must call fn with a transactional Store + // that respects atomicity for the ops executed within. + WithTx(ctx context.Context, fn func(Store) error) error +} + +// PromSync defines interactions with Prometheus or an exporter responsible for threshold materialization. +// Add/Delete manage the lifecycle of rule files; SyncMeta updates threshold sources. +type PromSync interface { + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error +} + +// AlertRuleMgr orchestrates validation, store operations, change logging, and Prometheus sync. 
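// A typical wiring, mirroring the README example (the alias map argument of
// NewManager may be nil; "service_version" -> "version" is illustrative):
//
//	store := NewPgStore(db)
//	prom := NewExporterSync()
//	var mgr AlertRuleMgr = NewManager(store, prom,
//		map[string]string{"service_version": "version"})
//	err := mgr.UpsertRuleMetas(ctx, meta)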
+type AlertRuleMgr interface { + LoadRule(ctx context.Context) error + UpsertRuleMetas(ctx context.Context, m *AlertRuleMeta) error + AddAlertRule(ctx context.Context, r *AlertRule) error + DeleteAlertRule(ctx context.Context, name string) error + + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error + RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error +} From 7ae82d12553ba2c02828e3b44b836cb6f629b701 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Sun, 21 Sep 2025 01:30:42 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E4=BC=98=E5=8C=96promethues=E4=B8=8E?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93=E6=93=8D=E4=BD=9C=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=E6=94=B9=E8=BF=9B=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86?= =?UTF-8?q?=EF=BC=8C=E6=94=B9=E8=BF=9B=E5=AD=98=E5=82=A8=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E8=BD=AC=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/alerting/database-design.md | 23 ++- internal/alerting/service/ruleset/manager.go | 93 +++++++-- internal/alerting/service/ruleset/store_pg.go | 113 ++++++++--- .../alerting/service/ruleset/store_pg_test.go | 188 ++++++++++++++++++ 4 files changed, 352 insertions(+), 65 deletions(-) create mode 100644 internal/alerting/service/ruleset/store_pg_test.go diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md index 98b3e1c..b119349 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -76,13 +76,13 @@ ### 4) alert_rules(告警规则表) -| 字段名 | 类型 | 说明 | +| 字段名 | 类型 | 说明 | |--------|------|------| -|name|varchar(255)|主键,告警规则名称| -|description|text|可读标题,可拼接渲染为可读的 title| -|expr|text|左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version))| -|op|varchar(4)|阈值比较方式(枚举:>, <, =, !=)| -|severity|varchar(32)|告警等级,通常进入告警的 labels.severity| +| name | varchar(255) | 主键,告警规则名称 | +| description | text | 可读标题,可拼接渲染为可读的 title | +| expr | text | 左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version)) | +| op | varchar(4) | 阈值比较方式(枚举:>, <, =, !=) | +| severity | varchar(32) | 告警等级,通常进入告警的 labels.severity | **约束建议:** - CHECK 约束:`op IN ('>', '<', '=', '!=')` @@ -91,11 +91,12 @@ ### 5) alert_rule_metas(规则阈值元信息表) -字段名 类型 说明 -alert_name varchar(255) 关联 `alert_rules.name` -labels jsonb 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 -threshold numeric 阈值(会被渲染成特定规则的 threshold metric 数值) -watch_time interval 持续时长(映射 Prometheus rule 的 for:) +| 字段名 | 类型 | 说明 | +|--------|------|------| +| alert_name | varchar(255) | 关联 `alert_rules.name` | +| labels | jsonb | 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 | +| threshold | numeric | 阈值(会被渲染成特定规则的 threshold metric 数值) | +| watch_time | interval | 持续时长(映射 Prometheus rule 的 for:) | **约束与索引建议:** - FOREIGN KEY: `(alert_name)` REFERENCES `alert_rules(name)` ON DELETE CASCADE diff --git a/internal/alerting/service/ruleset/manager.go b/internal/alerting/service/ruleset/manager.go index 03e58d2..7ffb279 100644 --- a/internal/alerting/service/ruleset/manager.go +++ b/internal/alerting/service/ruleset/manager.go @@ -32,20 +32,35 @@ func (m *Manager) AddAlertRule(ctx context.Context, r *AlertRule) error { if r == nil || r.Name == "" { return fmt.Errorf("invalid rule") } + // First ensure the rule is added to Prometheus successfully + // This guarantees Prometheus has the correct data even if DB write fails + if err := m.prom.AddToPrometheus(ctx, 
r); err != nil { + return fmt.Errorf("failed to add rule to Prometheus: %w", err) + } + // Then persist to database + // If this fails, the rule will still be in Prometheus, which is better than + // having it in DB but not in Prometheus (which would cause missing alerts) if err := m.store.CreateRule(ctx, r); err != nil { - return err + return fmt.Errorf("failed to create rule in database: %w", err) } - return m.prom.AddToPrometheus(ctx, r) + return nil } func (m *Manager) DeleteAlertRule(ctx context.Context, name string) error { if name == "" { return fmt.Errorf("invalid name") } + // First remove from Prometheus to stop alerting immediately + // This prevents false alerts if DB deletion fails + if err := m.prom.DeleteFromPrometheus(ctx, name); err != nil { + return fmt.Errorf("failed to delete rule from Prometheus: %w", err) + } + // Then remove from database + // If this fails, the rule is already removed from Prometheus (no false alerts) if err := m.store.DeleteRule(ctx, name); err != nil { - return err + return fmt.Errorf("failed to delete rule from database: %w", err) } - return m.prom.DeleteFromPrometheus(ctx, name) + return nil } func (m *Manager) AddToPrometheus(ctx context.Context, r *AlertRule) error { @@ -66,33 +81,54 @@ func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) erro if err := validateMeta(meta); err != nil { return err } + + // First, get the old meta for change logging + oldList, err := m.store.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { + return err + } + var old *AlertRuleMeta + if len(oldList) > 0 { + old = oldList[0] + } + + // Prepare change log parameters outside of transaction to minimize lock time + var changeLog *ChangeLog + if old != nil || meta != nil { + changeLog = m.prepareChangeLog(old, meta) + } + + // First ensure the meta is synced to Prometheus successfully + // This guarantees Prometheus has the correct threshold data even if DB write fails + if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { + return fmt.Errorf("failed to sync meta to Prometheus: %w", err) + } + + // Then persist to database within a transaction + // If this fails, the meta will still be in Prometheus, which is better than + // having it in DB but not in Prometheus (which would cause incorrect thresholds) return m.store.WithTx(ctx, func(tx Store) error { - oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) - if err != nil { - return err - } - var old *AlertRuleMeta - if len(oldList) > 0 { - old = oldList[0] - } _, err = tx.UpsertMeta(ctx, meta) if err != nil { return err } - if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { - return err - } - if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { - return err + // Insert pre-prepared change log + if changeLog != nil { + if err := tx.InsertChangeLog(ctx, changeLog); err != nil { + return err + } } return nil }) } -func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { +// prepareChangeLog prepares change log parameters outside of transaction to minimize lock time +func (m *Manager) prepareChangeLog(oldMeta, newMeta *AlertRuleMeta) *ChangeLog { if newMeta == nil { return nil } + + // Prepare all parameters outside of transaction var oldTh, newTh *float64 var oldW, newW *time.Duration if oldMeta != nil { @@ -103,8 +139,14 @@ func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *Ale newTh = &newMeta.Threshold newW = &newMeta.WatchTime } - log := &ChangeLog{ - ID: 
fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), time.Now().UnixNano()), + + // Generate ID and timestamp outside of transaction + now := time.Now() + changeTime := now.UTC() + id := fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), now.UnixNano()) + + return &ChangeLog{ + ID: id, AlertName: newMeta.AlertName, ChangeType: classifyChange(oldMeta, newMeta), Labels: newMeta.Labels, @@ -112,9 +154,16 @@ func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *Ale NewThreshold: newTh, OldWatch: oldW, NewWatch: newW, - ChangeTime: time.Now().UTC(), + ChangeTime: changeTime, + } +} + +func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { + changeLog := m.prepareChangeLog(oldMeta, newMeta) + if changeLog == nil { + return nil } - return m.store.InsertChangeLog(ctx, log) + return m.store.InsertChangeLog(ctx, changeLog) } func classifyChange(oldMeta, newMeta *AlertRuleMeta) string { diff --git a/internal/alerting/service/ruleset/store_pg.go b/internal/alerting/service/ruleset/store_pg.go index fe3d231..cd4c063 100644 --- a/internal/alerting/service/ruleset/store_pg.go +++ b/internal/alerting/service/ruleset/store_pg.go @@ -6,12 +6,14 @@ import ( "fmt" "time" + "github.com/jackc/pgx/v5/pgtype" abd "github.com/qiniu/zeroops/internal/alerting/database" ) // PgStore is a PostgreSQL-backed Store implementation using the alerting database wrapper. // Note: The current database wrapper does not expose transactions; WithTx acts as a simple wrapper. // For production-grade atomicity, extend the database wrapper to support sql.Tx and wire it here. +// This implementation uses pgx native types to avoid manual parsing of PostgreSQL interval types. type PgStore struct { DB *abd.Database } @@ -76,16 +78,22 @@ func (s *PgStore) DeleteRule(ctx context.Context, name string) error { } func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error) { - labelsJSON, _ := json.Marshal(m.Labels) + labelsJSON, err := json.Marshal(m.Labels) + if err != nil { + return false, fmt.Errorf("marshal labels: %w", err) + } + + // Convert time.Duration to pgtype.Interval + interval := durationToPgInterval(m.WatchTime) + const q = ` INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) VALUES ($1, $2::jsonb, $3, $4) ON CONFLICT (alert_name, labels) DO UPDATE SET threshold=EXCLUDED.threshold, - watch_time=EXCLUDED.watch_time, - updated_at=now() + watch_time=EXCLUDED.watch_time ` - _, err := s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, m.WatchTime) + _, err = s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, interval) if err != nil { return false, fmt.Errorf("upsert meta: %w", err) } @@ -94,7 +102,10 @@ func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error } func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { - labelsJSON, _ := json.Marshal(labels) + labelsJSON, err := json.Marshal(labels) + if err != nil { + return nil, fmt.Errorf("marshal labels for get: %w", err) + } const q = ` SELECT alert_name, labels, threshold, watch_time FROM alert_rule_metas @@ -110,19 +121,19 @@ func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([ var alertName string var labelsRaw string var threshold float64 - var watch any + var watch pgtype.Interval if err := rows.Scan(&alertName, &labelsRaw, &threshold, &watch); err != nil { return nil, 
fmt.Errorf("scan meta: %w", err) } lm := LabelMap{} - _ = json.Unmarshal([]byte(labelsRaw), &lm) + if err := json.Unmarshal([]byte(labelsRaw), &lm); err != nil { + return nil, fmt.Errorf("unmarshal labels: %w", err) + } meta := &AlertRuleMeta{AlertName: alertName, Labels: lm, Threshold: threshold} - // best-effort: watch_time may come back as string or duration; we try string -> duration - switch v := watch.(type) { - case string: - if d, err := timeParseDurationPG(v); err == nil { - meta.WatchTime = d - } + + // Convert pgtype.Interval to time.Duration + if duration, err := pgIntervalToDuration(watch); err == nil { + meta.WatchTime = duration } res = append(res, meta) } @@ -130,9 +141,12 @@ func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([ } func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { - labelsJSON, _ := json.Marshal(labels) + labelsJSON, err := json.Marshal(labels) + if err != nil { + return fmt.Errorf("marshal labels: %w", err) + } const q = `DELETE FROM alert_rule_metas WHERE alert_name=$1 AND labels=$2::jsonb` - _, err := s.DB.ExecContext(ctx, q, name, string(labelsJSON)) + _, err = s.DB.ExecContext(ctx, q, name, string(labelsJSON)) if err != nil { return fmt.Errorf("delete meta: %w", err) } @@ -140,31 +154,66 @@ func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) } func (s *PgStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { - labelsJSON, _ := json.Marshal(log.Labels) + labelsJSON, err := json.Marshal(log.Labels) + if err != nil { + return fmt.Errorf("marshal labels for changelog: %w", err) + } + + // Convert time.Duration to pgtype.Interval for old and new watch times + var oldWatch, newWatch *pgtype.Interval + if log.OldWatch != nil { + interval := durationToPgInterval(*log.OldWatch) + oldWatch = &interval + } + if log.NewWatch != nil { + interval := durationToPgInterval(*log.NewWatch) + newWatch = &interval + } + const q = ` INSERT INTO alert_meta_change_logs(id, alert_name, change_type, labels, old_threshold, new_threshold, old_watch, new_watch, change_time) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) ` - _, err := s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, log.OldWatch, log.NewWatch, log.ChangeTime) + _, err = s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, oldWatch, newWatch, log.ChangeTime) if err != nil { return fmt.Errorf("insert change log: %w", err) } return nil } -// timeParseDurationPG parses a small subset of PostgreSQL interval text output into time.Duration. -// Supported examples: "01:02:03", "02:03", "3600 seconds". Best-effort only. -func timeParseDurationPG(s string) (time.Duration, error) { - // HH:MM:SS - var h, m int - var sec float64 - if n, _ := fmt.Sscanf(s, "%d:%d:%f", &h, &m, &sec); n >= 2 { - d := time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(sec*float64(time.Second)) - return d, nil - } - var seconds float64 - if n, _ := fmt.Sscanf(s, "%f seconds", &seconds); n == 1 { - return time.Duration(seconds * float64(time.Second)), nil - } - return 0, fmt.Errorf("unsupported interval format: %s", s) +// durationToPgInterval converts a time.Duration to pgtype.Interval. +// Note: This conversion assumes the duration represents a fixed time period. +// For durations that include months or years, this conversion may not be accurate. 
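// Worked example: 25h30m is 91,800,000,000 µs in total; one day accounts for
// 86,400,000,000 µs, leaving 5,400,000,000 µs (1h30m), so the result is
// pgtype.Interval{Days: 1, Microseconds: 5_400_000_000}.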
+func durationToPgInterval(d time.Duration) pgtype.Interval { + // Convert to total microseconds first + totalMicroseconds := d.Microseconds() + + // Calculate days and remaining microseconds + days := totalMicroseconds / (24 * 60 * 60 * 1000000) // 24 hours * 60 minutes * 60 seconds * 1,000,000 microseconds + remainingMicroseconds := totalMicroseconds % (24 * 60 * 60 * 1000000) + + return pgtype.Interval{ + Microseconds: remainingMicroseconds, + Days: int32(days), + Months: 0, // Duration doesn't include months + Valid: true, + } +} + +// pgIntervalToDuration converts a pgtype.Interval to time.Duration. +// This function returns an error if the interval contains months or years, +// as these cannot be accurately converted to a fixed duration. +func pgIntervalToDuration(interval pgtype.Interval) (time.Duration, error) { + if !interval.Valid { + return 0, fmt.Errorf("interval is not valid") + } + + // Check if the interval contains months or years + if interval.Months != 0 { + return 0, fmt.Errorf("cannot convert interval with months to duration: %d months", interval.Months) + } + + // Convert to duration + totalMicroseconds := interval.Microseconds + int64(interval.Days)*24*60*60*1000000 + return time.Duration(totalMicroseconds) * time.Microsecond, nil } diff --git a/internal/alerting/service/ruleset/store_pg_test.go b/internal/alerting/service/ruleset/store_pg_test.go new file mode 100644 index 0000000..a19c1e5 --- /dev/null +++ b/internal/alerting/service/ruleset/store_pg_test.go @@ -0,0 +1,188 @@ +package ruleset + +import ( + "testing" + "time" + + "github.com/jackc/pgx/v5/pgtype" +) + +func TestDurationToPgInterval(t *testing.T) { + tests := []struct { + name string + duration time.Duration + expected pgtype.Interval + }{ + { + name: "Zero duration", + duration: 0, + expected: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 second", + duration: 1 * time.Second, + expected: pgtype.Interval{ + Microseconds: 1000000, // 1 second = 1,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 minute", + duration: 1 * time.Minute, + expected: pgtype.Interval{ + Microseconds: 60000000, // 1 minute = 60,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 hour", + duration: 1 * time.Hour, + expected: pgtype.Interval{ + Microseconds: 3600000000, // 1 hour = 3,600,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 day", + duration: 24 * time.Hour, + expected: pgtype.Interval{ + Microseconds: 0, // Days are stored separately + Days: 1, + Months: 0, + Valid: true, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := durationToPgInterval(tt.duration) + if got != tt.expected { + t.Errorf("durationToPgInterval() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestPgIntervalToDuration(t *testing.T) { + tests := []struct { + name string + interval pgtype.Interval + expected time.Duration + expectError bool + }{ + { + name: "Valid interval - microseconds only", + interval: pgtype.Interval{ + Microseconds: 1000000, // 1 second + Days: 0, + Months: 0, + Valid: true, + }, + expected: 1 * time.Second, + expectError: false, + }, + { + name: "Valid interval - days only", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 1, + Months: 0, + Valid: true, + }, + expected: 24 * time.Hour, + expectError: false, + }, + { + name: "Valid interval - days and microseconds", + interval: pgtype.Interval{ + Microseconds: 
1000000, // 1 second + Days: 1, // 1 day + Months: 0, + Valid: true, + }, + expected: 24*time.Hour + 1*time.Second, + expectError: false, + }, + { + name: "Invalid interval - contains months", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 1, + Valid: true, + }, + expected: 0, + expectError: true, + }, + { + name: "Invalid interval - not valid", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 0, + Valid: false, + }, + expected: 0, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := pgIntervalToDuration(tt.interval) + if (err != nil) != tt.expectError { + t.Errorf("pgIntervalToDuration() error = %v, expectError %v", err, tt.expectError) + return + } + if got != tt.expected { + t.Errorf("pgIntervalToDuration() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestDurationRoundTrip(t *testing.T) { + tests := []struct { + name string + duration time.Duration + }{ + {"Zero", 0}, + {"1 second", 1 * time.Second}, + {"1 minute", 1 * time.Minute}, + {"1 hour", 1 * time.Hour}, + {"1 day", 24 * time.Hour}, + {"1 day 1 hour", 25 * time.Hour}, + {"1 day 1 hour 1 minute", 25*time.Hour + 1*time.Minute}, + {"1 day 1 hour 1 minute 1 second", 25*time.Hour + 1*time.Minute + 1*time.Second}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Convert duration to pgtype.Interval + interval := durationToPgInterval(tt.duration) + + // Convert back to duration + got, err := pgIntervalToDuration(interval) + if err != nil { + t.Errorf("pgIntervalToDuration() error = %v", err) + return + } + + if got != tt.duration { + t.Errorf("Round trip conversion failed: got %v, want %v", got, tt.duration) + } + }) + } +} From 132f692e98d84ce0cb5e0d6fffa3baf38c082471 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Sun, 21 Sep 2025 03:01:41 +0800 Subject: [PATCH 3/4] feat: implement alert healing with observation window mechanism - Add heal_actions table and related DAO/Service layers - Implement observation window mechanism using Redis - Add P0/P1/P2 alert processing logic with fault healing - Update remediation consumer with healing and observation flows - Add comprehensive tests for new functionality - Update documentation with new healing process --- docs/alerting/database-design.md | 47 ++- go.mod | 3 + internal/alerting/database/database.go | 5 + .../alerting/service/remediation/README.md | 373 +++++++++++++++--- .../alerting/service/remediation/consumer.go | 258 ++++++++++-- .../service/remediation/heal_action_dao.go | 145 +++++++ .../remediation/heal_action_service.go | 172 ++++++++ .../remediation/heal_action_service_test.go | 178 +++++++++ .../service/remediation/init_heal_actions.sql | 38 ++ .../service/remediation/observation_window.go | 169 ++++++++ .../remediation/observation_window_test.go | 100 +++++ .../alerting/service/remediation/types.go | 74 ++++ 12 files changed, 1492 insertions(+), 70 deletions(-) create mode 100644 internal/alerting/service/remediation/heal_action_dao.go create mode 100644 internal/alerting/service/remediation/heal_action_service.go create mode 100644 internal/alerting/service/remediation/heal_action_service_test.go create mode 100644 internal/alerting/service/remediation/init_heal_actions.sql create mode 100644 internal/alerting/service/remediation/observation_window.go create mode 100644 internal/alerting/service/remediation/observation_window_test.go create mode 100644 internal/alerting/service/remediation/types.go diff --git 
a/docs/alerting/database-design.md b/docs/alerting/database-design.md index b119349..0f860df 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -2,7 +2,7 @@ ## 概述 -本文档为最新数据库设计,总计包含 6 张表: +本文档为最新数据库设计,总计包含 7 张表: - alert_issues - alert_issue_comments @@ -10,6 +10,7 @@ - alert_rules - alert_rule_metas - service_states +- heal_actions ## 数据表设计 @@ -111,7 +112,7 @@ --- -### 7) service_states(服务状态表) +### 6) service_states(服务状态表) 追踪服务在某一版本上的健康状态与处置进度。 @@ -127,6 +128,34 @@ **索引建议:** - PRIMARY KEY: `(service, version)` +--- + +### 7) heal_actions(告警治愈解决方案表) + +存储不同故障域对应的治愈方案和规则。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | varchar(255) PK | 治愈方案 ID | +| desc | text | 简单描述,如 action 是处理什么告警场景的 | +| type | varchar(255) | 对应的故障域类型 | +| rules | jsonb | 条件规则:{condition1: action1, condition2: action2} | + +**索引建议:** +- PRIMARY KEY: `id` +- INDEX: `(type)` + +**示例数据:** +```sql +INSERT INTO heal_actions (id, desc, type, rules) VALUES +('service_version_rollback', '服务版本回滚方案', 'service_version_issue', + '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}'), +('service_version_alert', '服务版本告警方案', 'service_version_issue', + '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚"}'); +``` + +TODO: health_state映射逻辑 + ## 数据关系(ER) ```mermaid @@ -175,13 +204,25 @@ erDiagram text content } + heal_actions { + varchar id PK + text desc + varchar type + jsonb rules + } + %% 通过 service 等标签在应用层逻辑关联 alert_rule_metas ||..|| alert_rules : "by alert_name" service_states ||..|| alert_rule_metas : "by service/version labels" + heal_actions ||..|| alert_issues : "by fault domain analysis" ``` ## 数据流转 1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。 2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。 -3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file +3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 +4. 
**告警治愈流程**: + - P0 告警:根据 `alert_issues.labels` 识别故障域,查询 `heal_actions` 获取治愈方案 + - 执行治愈操作(如回滚),成功后更新 `alert_issues` 和 `service_states` 状态 + - P1/P2 告警:直接进入下钻分析,记录分析结果到 `alert_issue_comments` \ No newline at end of file diff --git a/go.mod b/go.mod index 6094f9c..8cf046f 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 + github.com/stretchr/testify v1.11.1 ) require ( @@ -16,6 +17,7 @@ require ( github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect github.com/gin-contrib/cors v1.7.6 // indirect @@ -39,6 +41,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect golang.org/x/arch v0.18.0 // indirect diff --git a/internal/alerting/database/database.go b/internal/alerting/database/database.go index e6ee504..6b2ab9b 100644 --- a/internal/alerting/database/database.go +++ b/internal/alerting/database/database.go @@ -38,3 +38,8 @@ func (d *Database) ExecContext(ctx context.Context, q string, args ...any) (sql. func (d *Database) QueryContext(ctx context.Context, q string, args ...any) (*sql.Rows, error) { return d.db.QueryContext(ctx, q, args...) } + +// QueryRowContext exposes database/sql QueryRowContext for single row SELECT queries. +func (d *Database) QueryRowContext(ctx context.Context, q string, args ...any) *sql.Row { + return d.db.QueryRowContext(ctx, q, args...) +} diff --git a/internal/alerting/service/remediation/README.md b/internal/alerting/service/remediation/README.md index de5ca08..ca41e49 100644 --- a/internal/alerting/service/remediation/README.md +++ b/internal/alerting/service/remediation/README.md @@ -1,23 +1,28 @@ -# remediation — 通道消费与自动回滚(Mock) +# remediation — 告警治愈与下钻分析 -本包规划一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,模拟执行“自动回滚”,回滚成功后将相关告警与服务态标记为恢复。 +本包实现一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,根据告警等级进行分流处理: +- **P0 告警**:进入"故障治愈"模块,执行自动修复操作 +- **P1/P2 告警**:进入"下钻分析"模块,进行深度分析 —— ## 1. 
目标 - 订阅 `healthcheck` 的 `AlertMessage`(进程内 channel) -- 对每条消息: - 1) Mock 调用回滚接口 `POST /v1/deployments/:deployID/rollback` - 2) `sleep 30s` 后返回“回滚成功”的模拟响应 - 3) 若成功,则更新 DB 与缓存: - - `alert_issues.alert_state = 'Restored'` - - `alert_issues.state = 'Closed'` - - `service_states.health_state = 'Normal'` - - `service_states.resolved_at = NOW()`(当前时间) - - 同时在 `alert_issue_comments` 中追加一条 AI 分析评论(见下文内容模板) - -> 说明:本阶段仅实现消费与 Mock,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 +- 根据 `level` 字段进行分流: + - **P0 告警**:故障治愈流程 + 1) 确认故障域(从 labels 分析 service_name + version) + 2) 查询 `heal_actions` 表获取治愈方案 + 3) 执行治愈操作(当前仅支持回滚) + 4) 治愈成功后启动观察窗口(默认30分钟) + 5) 观察窗口内如果出现新告警,取消观察并重新处理 + 6) 观察窗口完成后,更新服务状态为正常 + - **P1/P2 告警**:直接进入下钻分析流程 + 1) 执行 AI 分析 + 2) 更新告警状态为恢复 + 3) 记录分析结果到评论 + +> 说明:本阶段实现故障域识别和治愈方案查询,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 —— @@ -37,8 +42,10 @@ } ``` +- 故障域识别:从 `Labels` 中提取 `service_name` 和 `version` 信息 - deployID 的来源(用于构造回滚 URL): - - Mock 阶段:可从 `Labels["deploy_id"]`(若存在)读取;若为空,可按 `{service}:{version}` 组装一个占位 ID。 + - 可从 `Labels["deploy_id"]`(若存在)读取 + - 若为空,可按 `{service}:{version}` 组装一个占位 ID —— @@ -62,7 +69,7 @@ REMEDIATION_ROLLBACK_SLEEP=30s —— -## 4. 流程(伪代码) +## 4. 处理流程(伪代码) ```go func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rdb *redis.Client) { @@ -71,30 +78,256 @@ func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rd case <-ctx.Done(): return case m := <-ch: - // 1) 组装回滚 URL(Mock) - deployID := m.Labels["deploy_id"] - if deployID == "" { - // 仅 Mock:用 service:version 兜底 - deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + switch m.Level { + case "P0": + // P0 告警:故障治愈流程 + handleP0Alert(ctx, m, db, rdb) + case "P1", "P2": + // P1/P2 告警:下钻分析流程 + handleP1P2Alert(ctx, m, db, rdb) + default: + log.Printf("Unknown alert level: %s", m.Level) } - url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deployID) + } + } +} - // 2) 发起回滚(Mock):sleep 指定时间再判为成功 - sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - // TODO: 如需真实 HTTP 调用,可在此发起 POST 并根据响应判断 +// P0 告警处理:故障治愈流程 +func handleP0Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 确认故障域 + faultDomain := identifyFaultDomain(m.Labels) + + // 2) 查询治愈方案 + healAction, err := queryHealAction(ctx, db, faultDomain) + if err != nil { + log.Printf("Failed to query heal action: %v", err) + // 治愈方案查询失败,直接进入下钻分析 + handleDrillDownAnalysis(ctx, m, db, rdb) + return + } + + // 3) 执行治愈操作 + success := executeHealAction(ctx, healAction, m) + if !success { + log.Printf("Heal action failed for alert %s", m.ID) + // 治愈操作失败,直接进入下钻分析 + handleDrillDownAnalysis(ctx, m, db, rdb) + return + } + + // 4) 治愈成功后启动观察窗口,延迟状态更新 + handleDrillDownAnalysisWithObservation(ctx, m, db, rdb) +} - // 3) 成功后,先写入 AI 分析评论,再更新 DB 与缓存状态 - _ = addAIAnalysisComment(ctx, db, m) - _ = markRestoredInDB(ctx, db, m) - _ = markRestoredInCache(ctx, rdb, m) - } +// P1/P2 告警处理:下钻分析流程 +func handleP1P2Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + handleDrillDownAnalysis(ctx, m, db, rdb) +} + +// 故障域识别 +func identifyFaultDomain(labels map[string]string) string { + service := labels["service_name"] + version := labels["version"] + + if service != "" && version != "" { + return "service_version_issue" } + + // 可根据更多条件扩展其他故障域 + return "unknown" +} + +// 查询治愈方案 +func queryHealAction(ctx context.Context, db *Database, faultDomain string) (*HealAction, error) { + const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` + // 实现查询逻辑 + 
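// 查询实现示意(假设基于 database/sql,复用本补丁为 Database 新增的 QueryRowContext;
+	// 注意 desc 为 PostgreSQL 保留字,真实 SQL 中需写作 "desc" 或改名为 description):
+	//   row := db.QueryRowContext(ctx, q, faultDomain)
+	//   var a HealAction
+	//   var rulesJSON string
+	//   if err := row.Scan(&a.ID, &a.Desc, &a.Type, &rulesJSON); err != nil {
+	//       return nil, err
+	//   }
+	//   a.Rules = json.RawMessage(rulesJSON)
+	//   return &a, nil
+	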
return nil, nil +} + +// 执行治愈操作 +func executeHealAction(ctx context.Context, action *HealAction, m AlertMessage) bool { + // 根据 action.rules 中的条件执行相应操作 + // 当前仅支持回滚操作 + if action.Rules["action"] == "rollback" { + return executeRollback(ctx, m) + } else if action.Rules["action"] == "alert" { + log.Printf("Alert: %s", action.Rules["message"]) + return false + } + return false +} + +// 执行回滚操作 +func executeRollback(ctx context.Context, m AlertMessage) bool { + deployID := m.Labels["deploy_id"] + if deployID == "" { + deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + } + + // Mock 回滚:sleep 指定时间 + sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + // TODO: 真实 HTTP 调用回滚接口 + + return true +} + +// 下钻分析处理(P1/P2 告警直接使用) +func handleDrillDownAnalysis(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 执行 AI 分析 + _ = addAIAnalysisComment(ctx, db, m) + + // 2) 更新告警状态为恢复 + _ = markRestoredInDB(ctx, db, m) + + // 3) 更新缓存状态 + _ = markRestoredInCache(ctx, rdb, m) +} + +// 下钻分析处理(P0 告警治愈后使用,延迟状态更新) +func handleDrillDownAnalysisWithObservation(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 执行 AI 分析 + _ = addAIAnalysisComment(ctx, db, m) + + // 2) 记录治愈完成评论,但不更新告警状态 + _ = addHealingCompletedComment(ctx, db, m) + + // 3) 启动观察窗口,等待30分钟 + _ = startObservationWindow(ctx, m.Service, m.Version, m.ID, 30*time.Minute) + + // 注意:此时不更新 alert_issues.alert_state 和 service_states.health_state + // 状态更新将在观察窗口完成后进行 +} + +// 观察窗口完成后的处理 +func completeObservationWindow(ctx context.Context, service, version string, db *Database, rdb *redis.Client) { + // 1) 完成观察窗口 + _ = completeObservation(ctx, service, version) + + // 2) 更新 alert_issues.alert_state 为 'Restored' + // 3) 更新 service_states.health_state 为 'Normal' + // 4) 更新相关缓存 + _ = markServiceAsNormal(ctx, service, version, db, rdb) + + log.Printf("Observation window completed for service %s version %s, status updated to Normal", service, version) } ``` —— -## 5. DB 更新(SQL 建议) +## 5. 故障域识别与治愈方案 + +### 故障域类型 + +当前支持的故障域类型: + +1. **service_version_issue**:服务版本问题 + - 识别条件:`labels["service_name"]` 和 `labels["version"]` 都存在 + - 治愈方案: + - 发布中版本:执行回滚操作 + - 已完成发布版本:提示暂不支持自动回滚 + +2. **unknown**:未知故障域 + - 识别条件:无法从标签中识别出已知故障域 + - 处理方式:跳过治愈,直接进入下钻分析 + +### 治愈方案规则 + +`heal_actions.rules` 字段的 JSON 结构: + +```json +{ + "deployment_status": "deploying|deployed", + "action": "rollback|alert", + "target": "previous_version", + "message": "版本已发布,暂不支持自动回滚" +} +``` + +### 治愈操作类型 + +1. **rollback**:执行回滚操作 + - 调用部署系统的回滚接口 + - 回滚到上一个稳定版本 + +2. **alert**:仅告警,不执行自动操作 + - 记录告警信息 + - 需要人工介入处理 + +### 扩展性设计 + +- 故障域类型可扩展:整体问题、单机房问题、网络问题等 +- 治愈方案可扩展:重启服务、扩容、切换流量等 +- 规则条件可扩展:基于更多标签和指标进行判断 + +#### 添加新的故障域类型 + +1. 在 `types.go` 中添加新的 `FaultDomain` 常量 +2. 在 `IdentifyFaultDomain` 方法中添加识别逻辑 +3. 在数据库中配置对应的治愈方案 + +#### 添加新的治愈操作类型 + +1. 在 `HealActionRules` 结构体中添加新字段 +2. 在 `ExecuteHealAction` 方法中添加新的 case 分支 +3. 实现具体的治愈操作逻辑 + +### 观察窗口机制 + +观察窗口是治愈操作完成后的验证期,用于确保治愈操作的有效性: + +1. **启动条件**:P0 告警治愈操作成功完成后自动启动 +2. **持续时间**:默认30分钟,可配置 +3. **监控内容**:观察该服务是否在窗口期内出现新的告警 +4. **处理逻辑**: + - 如果窗口期内出现新告警:取消观察窗口,重新进入治愈流程 + - 如果窗口期内无新告警:完成观察窗口,更新服务状态为正常 +5. **状态更新时机**: + - **治愈操作完成后**:不立即更新状态,只记录治愈完成评论 + - **观察窗口完成后**:同时更新 `alert_issues.alert_state` 为 `Restored` 和 `service_states.health_state` 为 `Normal` +6. **关键原则**:每次修改 `service_states.health_state` 为 `Normal` 时,都必须同时修改 `alert_issues.alert_state` 为 `Restored` + +—— + +## 6. 
代码使用示例
+
+### 数据库初始化
+
+```bash
+# 执行初始化脚本
+psql -U postgres -d zeroops -f init_heal_actions.sql
+```
+
+### 代码使用
+
+```go
+// 创建服务
+healDAO := NewPgHealActionDAO(db)
+healService := NewHealActionService(healDAO)
+
+// 识别故障域
+faultDomain := healService.IdentifyFaultDomain(labels)
+
+// 获取治愈方案
+healAction, err := healService.GetHealAction(ctx, faultDomain)
+
+// 执行治愈操作
+result, err := healService.ExecuteHealAction(ctx, healAction, alertID, labels)
+```
+
+### 测试
+
+运行测试:
+
+```bash
+go test ./internal/alerting/service/remediation -v
+```
+
+测试覆盖:
+- 故障域识别逻辑
+- 治愈操作执行
+- 部署状态判断
+
+## 7. DB 更新(SQL 建议)
 
 - 告警状态:
 ```sql
@@ -121,7 +354,7 @@ VALUES (
 );
 ```
 
-评论内容模板(Markdown,多行):
+评论内容模板(Markdown,多行,内容暂未设计):
 ```
 ## AI分析结果
 **问题类型**:非发版本导致的问题
@@ -137,7 +370,7 @@ VALUES (
 
 ——
 
-## 6. 缓存更新(Redis,Lua CAS 建议)
+## 8. 缓存更新(Redis,Lua CAS 建议)
 
 - 告警缓存 `alert:issue:{id}`:
 ```lua
@@ -178,30 +411,80 @@ return 1
 
 ——
 
-## 7. 幂等与重试
+## 9. 幂等与重试
 
 - 幂等:同一 `AlertMessage.ID` 的回滚处理应具备幂等性,重复消费不应产生额外副作用。
 - 重试:Mock 模式下可忽略;接入真实接口后,对 5xx/网络错误考虑重试与退避,最终写入失败应有告警与补偿。
 
 ——
 
-## 8. 验证步骤(与 healthcheck E2E 相衔接)
+## 10. 验证步骤(与 healthcheck E2E 相衔接)
+
+### 基础验证步骤
 
 1) 启动 Redis/Postgres 与 API(参考 `healthcheck/E2E_VALIDATION.md` 与 `env_example.txt`)
-2) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)`
-3) `curl` 触发 Webhook,`alert_issues` 入库为 `Pending`
-4) 等待 `healthcheck` 将缓存态切到 `InProcessing`
-5) 等待 `remediation` mock 回滚完成 → DB 与缓存更新:
-   - `alert_issues.alert_state = 'Restored'`
-   - `service_states.health_state = 'Normal'`
-   - `service_states.resolved_at = NOW()`
-6) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新(comments 仍为 mock)
+2) 创建 `heal_actions` 表并插入测试数据
+3) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)`
+
+### P0 告警验证(故障治愈流程)
+
+4) 触发 P0 级别 Webhook,`alert_issues` 入库为 `Pending`
+5) 等待 `healthcheck` 将缓存态切到 `InProcessing`
+6) 验证故障域识别:检查日志中是否正确识别为 `service_version_issue`
+7) 验证治愈方案查询:检查是否从 `heal_actions` 表查询到对应方案
+8) 等待 `remediation` 执行治愈操作完成:
+   - 验证观察窗口已启动(Redis 中存在观察窗口记录)
+   - `alert_issue_comments` 中新增治愈完成评论
+   - **重要**:验证 `alert_issues.alert_state` 仍为 `InProcessing`(未更新为 `Restored`)
+   - **重要**:验证 `service_states.health_state` 未更新为 `Normal`
+9) 等待观察窗口完成(30分钟后)或模拟窗口期内新告警:
+   - **如果无新告警**:
+     - 验证观察窗口自动完成
+     - 验证状态同时更新为 `alert_issues.alert_state = 'Restored'` 和 `service_states.health_state = 'Normal'`
+   - **如果有新告警**:
+     - 验证观察窗口被取消
+     - 验证重新进入治愈流程
+     - 验证状态未更新为 `Restored`/`Normal`
+
+### P1/P2 告警验证(下钻分析流程)
+
+10) 触发 P1 或 P2 级别 Webhook
+11) 验证直接进入下钻分析流程,跳过故障治愈步骤
+12) 验证 AI 分析评论生成和状态更新
+
+### 最终验证
+
+13) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新
+14) 验证不同告警等级的处理路径正确性
 
 ——
 
-## 9. 后续计划
+## 11. 注意事项
+
+1. **service_states 表逻辑**: 当前版本中,`service_states` 表的更新逻辑暂时不实现,但保留了扩展空间
+2. **Mock 模式**: 当前回滚操作为 Mock 模式,实际部署时需要接入真实的部署系统 API
+3. **错误处理**: 治愈操作失败时会记录日志并继续进入下钻分析流程
+4. **幂等性**: 同一告警的重复处理应该具备幂等性
+
+## 12. 
后续计划 + +### 短期计划 +- 实现 `heal_actions` 表的完整 CRUD 操作 +- 完善故障域识别逻辑,支持更多故障类型 - 接入真实部署系统回滚接口与鉴权 +- 实现治愈方案的动态配置和管理界面 + +### 中期计划 + +- 扩展治愈操作类型:服务重启、扩容、流量切换等 +- 增加治愈方案的执行结果反馈和效果评估 - 将进程内 channel 平滑切换为 MQ(Kafka/NATS) -- 完善指标与可观测:事件消费速率、成功率、时延分位、回滚结果等 -- 增加补偿任务:对“回滚成功但缓存/DB 未一致”的场景进行对账修复 +- 完善指标与可观测:事件消费速率、成功率、时延分位、治愈结果等 + +### 长期计划 + +- 基于历史数据训练 AI 模型,自动推荐最优治愈方案 +- 增加补偿任务:对"治愈成功但缓存/DB 未一致"的场景进行对账修复 +- 实现治愈方案的 A/B 测试和效果对比 +- 构建完整的故障自愈知识库和最佳实践库 diff --git a/internal/alerting/service/remediation/consumer.go b/internal/alerting/service/remediation/consumer.go index 07cae6b..cead8d1 100644 --- a/internal/alerting/service/remediation/consumer.go +++ b/internal/alerting/service/remediation/consumer.go @@ -3,7 +3,6 @@ package remediation import ( "context" "fmt" - "os" "strconv" "time" @@ -17,46 +16,189 @@ type Consumer struct { DB *adb.Database Redis *redis.Client + // Heal action service for P0 alerts + healService HealActionService + + // Observation window manager + obsManager ObservationWindowManager + // sleepFn allows overriding for tests sleepFn func(time.Duration) } func NewConsumer(db *adb.Database, rdb *redis.Client) *Consumer { - return &Consumer{DB: db, Redis: rdb, sleepFn: time.Sleep} + healDAO := NewPgHealActionDAO(db) + healService := NewHealActionService(healDAO) + obsManager := NewRedisObservationWindowManager(rdb) + return &Consumer{ + DB: db, + Redis: rdb, + healService: healService, + obsManager: obsManager, + sleepFn: time.Sleep, + } } -// Start consumes alert messages and performs a mocked rollback then marks restored. +// Start consumes alert messages and processes them based on alert level func (c *Consumer) Start(ctx context.Context, ch <-chan healthcheck.AlertMessage) { if ch == nil { log.Warn().Msg("remediation consumer started without channel; no-op") return } - sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + for { select { case <-ctx.Done(): return case m := <-ch: - // 1) Mock rollback: optional URL composition (unused) - _ = fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(&m)) - // 2) Sleep to simulate rollback time - if c.sleepFn != nil { - c.sleepFn(sleepDur) - } - // 3) On success: add AI analysis comment, update DB and cache - if err := c.addAIAnalysisComment(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - if err := c.markRestoredInDB(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") - } - if err := c.markRestoredInCache(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") + // 首先检查是否有观察窗口需要处理 + c.handleObservationWindow(ctx, &m) + + switch m.Level { + case "P0": + // P0 告警:故障治愈流程 + c.handleP0Alert(ctx, &m) + case "P1", "P2": + // P1/P2 告警:下钻分析流程 + c.handleP1P2Alert(ctx, &m) + default: + log.Warn().Str("level", m.Level).Str("issue", m.ID).Msg("unknown alert level, skipping") } } } } +// handleObservationWindow handles observation window logic for incoming alerts +func (c *Consumer) handleObservationWindow(ctx context.Context, m *healthcheck.AlertMessage) { + if m.Service == "" { + return // No service information, skip observation window check + } + + // 检查是否有该服务的观察窗口 + window, err := c.obsManager.CheckObservation(ctx, m.Service, m.Version) + if err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to check observation window") + return + } + + if window == nil { + return // No active observation window + } + + // 
如果在观察窗口期间出现新的告警,取消观察窗口 + log.Warn(). + Str("service", m.Service). + Str("version", m.Version). + Str("alert_id", m.ID). + Str("observation_alert_id", window.AlertID). + Msg("new alert detected during observation window, cancelling observation") + + if err := c.obsManager.CancelObservation(ctx, m.Service, m.Version); err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to cancel observation window") + } +} + +// handleP0Alert handles P0 alerts with fault healing process +func (c *Consumer) handleP0Alert(ctx context.Context, m *healthcheck.AlertMessage) { + log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P0 alert with fault healing") + + // 1) 确认故障域 + faultDomain := c.healService.IdentifyFaultDomain(m.Labels) + log.Info().Str("issue", m.ID).Str("fault_domain", string(faultDomain)).Msg("identified fault domain") + + // 2) 查询治愈方案 + healAction, err := c.healService.GetHealAction(ctx, faultDomain) + if err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("failed to get heal action") + // 如果无法获取治愈方案,直接进入下钻分析 + c.handleDrillDownAnalysis(ctx, m) + return + } + + // 3) 执行治愈操作 + result, err := c.healService.ExecuteHealAction(ctx, healAction, m.ID, m.Labels) + if err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("failed to execute heal action") + c.handleDrillDownAnalysis(ctx, m) + return + } + + if !result.Success { + log.Warn().Str("issue", m.ID).Str("message", result.Message).Msg("heal action failed") + // 治愈失败,仍然进入下钻分析 + c.handleDrillDownAnalysis(ctx, m) + return + } + + log.Info().Str("issue", m.ID).Str("message", result.Message).Msg("heal action completed successfully") + + // 4) 治愈成功后启动观察窗口 + if m.Service != "" { + obsDuration := GetObservationDuration() + if err := c.obsManager.StartObservation(ctx, m.Service, m.Version, m.ID, obsDuration); err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to start observation window") + } else { + log.Info(). + Str("service", m.Service). + Str("version", m.Version). + Str("alert_id", m.ID). + Dur("duration", obsDuration). 
+ Msg("started observation window after successful healing") + } + } + + // 5) 治愈成功后进入下钻分析(但不立即更新状态) + c.handleDrillDownAnalysisWithObservation(ctx, m) +} + +// handleP1P2Alert handles P1/P2 alerts with drill-down analysis +func (c *Consumer) handleP1P2Alert(ctx context.Context, m *healthcheck.AlertMessage) { + log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P1/P2 alert with drill-down analysis") + + // 直接进入下钻分析流程 + c.handleDrillDownAnalysis(ctx, m) +} + +// handleDrillDownAnalysis performs drill-down analysis and marks alert as restored +func (c *Consumer) handleDrillDownAnalysis(ctx context.Context, m *healthcheck.AlertMessage) { + // 1) 执行 AI 分析 + if err := c.addAIAnalysisComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + + // 2) 更新告警状态为恢复 + if err := c.markRestoredInDB(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") + } + + // 3) 更新缓存状态 + if err := c.markRestoredInCache(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") + } +} + +// handleDrillDownAnalysisWithObservation performs drill-down analysis but delays status update for observation +func (c *Consumer) handleDrillDownAnalysisWithObservation(ctx context.Context, m *healthcheck.AlertMessage) { + // 1) 执行 AI 分析 + if err := c.addAIAnalysisComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + + // 2) 暂时不更新告警状态,等待观察窗口完成 + // 只记录治愈操作完成的评论 + if err := c.addHealingCompletedComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addHealingCompletedComment failed") + } + + log.Info(). + Str("issue", m.ID). + Str("service", m.Service). + Str("version", m.Version). 
+ Msg("healing completed, waiting for observation window to complete before updating status") +} + +// deriveDeployID derives deployment ID from alert message +// TODO: Use this function when implementing real rollback API calls func deriveDeployID(m *healthcheck.AlertMessage) string { if m == nil { return "" @@ -91,15 +233,38 @@ func (c *Consumer) addAIAnalysisComment(ctx context.Context, m *healthcheck.Aler return err } +func (c *Consumer) addHealingCompletedComment(ctx context.Context, m *healthcheck.AlertMessage) error { + if c.DB == nil || m == nil { + return nil + } + const existsQ = `SELECT 1 FROM alert_issue_comments WHERE issue_id=$1 AND content=$2 LIMIT 1` + const insertQ = `INSERT INTO alert_issue_comments (issue_id, create_at, content) VALUES ($1, NOW(), $2)` + content := "## 治愈操作完成\n" + + "**操作状态**:治愈操作已成功执行\n" + + "**观察窗口**:正在等待观察窗口完成(30分钟)\n" + + "**下一步**:如果观察窗口内无新告警,将自动更新服务状态为正常" + if rows, err := c.DB.QueryContext(ctx, existsQ, m.ID, content); err == nil { + defer rows.Close() + if rows.Next() { + return nil + } + } + _, err := c.DB.ExecContext(ctx, insertQ, m.ID, content) + return err +} + func (c *Consumer) markRestoredInDB(ctx context.Context, m *healthcheck.AlertMessage) error { if c.DB == nil || m == nil { return nil } - // alert_issues + + // 更新 alert_issues 状态 if _, err := c.DB.ExecContext(ctx, `UPDATE alert_issues SET alert_state = 'Restored' , state = 'Closed' WHERE id = $1`, m.ID); err != nil { return err } - // service_states (upsert) + + // 同时更新 service_states.health_state 为 Normal + // 注意:每次修改 service_states 为 Normal 时都需要修改 alert_issues.alert_state 为 Restored if m.Service != "" { const upsert = ` INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) @@ -112,6 +277,7 @@ SET health_state = 'Normal', return err } } + return nil } @@ -146,7 +312,7 @@ return 1 `) _, _ = script.Run(ctx, c.Redis, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing", "alert:index:alert_state:Restored", "alert:index:open", "alert:index:closed"}, "Restored", m.ID, "Closed").Result() - // 2) service_state:{service}:{version} → health_state=Normal; resolved_at=now; add to Normal index + // 更新 service_state 缓存 if m.Service != "" { svcKey := "service_state:" + m.Service + ":" + m.Version now := time.Now().UTC().Format(time.RFC3339Nano) @@ -165,6 +331,54 @@ return 1 return nil } +// CompleteObservationAndUpdateStatus completes observation window and updates service status +func (c *Consumer) CompleteObservationAndUpdateStatus(ctx context.Context, service, version string) error { + if service == "" { + return fmt.Errorf("service name is required") + } + + // 完成观察窗口 + if err := c.obsManager.CompleteObservation(ctx, service, version); err != nil { + return fmt.Errorf("failed to complete observation window: %w", err) + } + + // 更新服务状态为正常 + const upsert = ` +INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) +VALUES ($1, $2, NULL, NOW(), 'Normal', ARRAY[]::text[]) +ON CONFLICT (service, version) DO UPDATE +SET health_state = 'Normal', + resolved_at = NOW(); +` + if _, err := c.DB.ExecContext(ctx, upsert, service, version); err != nil { + return fmt.Errorf("failed to update service state: %w", err) + } + + // 更新缓存 + if c.Redis != nil { + svcKey := "service_state:" + service + ":" + version + now := time.Now().UTC().Format(time.RFC3339Nano) + svcScript := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then v = '{}' end +local obj = 
cjson.decode(v)
+obj.health_state = ARGV[1]
+obj.resolved_at = ARGV[2]
+redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL')
+if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end
+return 1
+`)
+		_, _ = svcScript.Run(ctx, c.Redis, []string{svcKey, "service_state:index:health:Normal"}, "Normal", now).Result()
+	}
+
+	// 注意:本补丁未包含在观察窗口到期后调用本方法的调度逻辑,需由外部定时任务触发
+	log.Info().
+		Str("service", service).
+		Str("version", version).
+		Msg("observation window completed successfully, service status updated to Normal")
+
+	return nil
+}
+
 func parseDuration(s string, d time.Duration) time.Duration {
 	if s == "" {
 		return d
diff --git a/internal/alerting/service/remediation/heal_action_dao.go b/internal/alerting/service/remediation/heal_action_dao.go
new file mode 100644
index 0000000..af71b5e
--- /dev/null
+++ b/internal/alerting/service/remediation/heal_action_dao.go
@@ -0,0 +1,145 @@
+package remediation
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"fmt"
+
+	adb "github.com/qiniu/zeroops/internal/alerting/database"
+)
+
+// PgHealActionDAO implements HealActionDAO using PostgreSQL
+type PgHealActionDAO struct {
+	DB *adb.Database
+}
+
+// NewPgHealActionDAO creates a new PostgreSQL heal action DAO
+func NewPgHealActionDAO(db *adb.Database) *PgHealActionDAO {
+	return &PgHealActionDAO{DB: db}
+}
+
+// GetByType retrieves a heal action by fault domain type
+func (d *PgHealActionDAO) GetByType(ctx context.Context, faultType string) (*HealAction, error) {
+	// desc 为 PostgreSQL 保留字,作为列名必须加双引号
+	const q = `SELECT id, "desc", type, rules FROM heal_actions WHERE type = $1 LIMIT 1`
+
+	row := d.DB.QueryRowContext(ctx, q, faultType)
+	var action HealAction
+	var rulesJSON string
+
+	err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return nil, fmt.Errorf("no heal action found for type: %s", faultType)
+		}
+		return nil, fmt.Errorf("failed to get heal action by type: %w", err)
+	}
+
+	action.Rules = json.RawMessage(rulesJSON)
+	return &action, nil
+}
+
+// GetByID retrieves a heal action by ID
+func (d *PgHealActionDAO) GetByID(ctx context.Context, id string) (*HealAction, error) {
+	const q = `SELECT id, "desc", type, rules FROM heal_actions WHERE id = $1`
+
+	row := d.DB.QueryRowContext(ctx, q, id)
+	var action HealAction
+	var rulesJSON string
+
+	err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return nil, fmt.Errorf("no heal action found with id: %s", id)
+		}
+		return nil, fmt.Errorf("failed to get heal action by id: %w", err)
+	}
+
+	action.Rules = json.RawMessage(rulesJSON)
+	return &action, nil
+}
+
+// Create creates a new heal action
+func (d *PgHealActionDAO) Create(ctx context.Context, action *HealAction) error {
+	const q = `INSERT INTO heal_actions (id, "desc", type, rules) VALUES ($1, $2, $3, $4)`
+
+	_, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules))
+	if err != nil {
+		return fmt.Errorf("failed to create heal action: %w", err)
+	}
+
+	return nil
+}
+
+// Update updates an existing heal action
+func (d *PgHealActionDAO) Update(ctx context.Context, action *HealAction) error {
+	const q = `UPDATE heal_actions SET "desc" = $2, type = $3, rules = $4 WHERE id = $1`
+
+	result, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules))
+	if err != nil {
+		return fmt.Errorf("failed to update heal action: %w", err)
+	}
+
+	rowsAffected, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("failed to get rows affected: %w", err)
+	}
+
+	if rowsAffected == 0 {
+		return fmt.Errorf("no heal action found with id: %s", action.ID)
+	}
+
+	return nil
+}
+
+// Delete deletes a heal action by ID
+func (d *PgHealActionDAO) Delete(ctx context.Context, id string) error {
+	const q = `DELETE FROM heal_actions WHERE id = $1`
+
+	result, err := d.DB.ExecContext(ctx, q, id)
+	if err != nil {
+		return fmt.Errorf("failed to delete heal action: %w", err)
+	}
+
+	rowsAffected, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("failed to get rows affected: %w", err)
+	}
+
+	if rowsAffected == 0 {
+		return fmt.Errorf("no heal action found with id: %s", id)
+	}
+
+	return nil
+}
+
+// List retrieves all heal actions
+func (d *PgHealActionDAO) List(ctx context.Context) ([]*HealAction, error) {
+	const q = `SELECT id, "desc", type, rules FROM heal_actions ORDER BY type, id`
+
+	rows, err := d.DB.QueryContext(ctx, q)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list heal actions: %w", err)
+	}
+	defer rows.Close()
+
+	var actions []*HealAction
+	for rows.Next() {
+		var action HealAction
+		var rulesJSON string
+
+		err := rows.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+		if err != nil {
+			return nil, fmt.Errorf("failed to scan heal action: %w", err)
+		}
+
+		action.Rules = json.RawMessage(rulesJSON)
+		actions = append(actions, &action)
+	}
+
+	if err = rows.Err(); err != nil {
+		return nil, fmt.Errorf("error iterating heal actions: %w", err)
+	}
+
+	return actions, nil
+}
diff --git a/internal/alerting/service/remediation/heal_action_service.go b/internal/alerting/service/remediation/heal_action_service.go
new file mode 100644
index 0000000..aa9a8f0
--- /dev/null
+++ b/internal/alerting/service/remediation/heal_action_service.go
@@ -0,0 +1,172 @@
+package remediation
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/rs/zerolog/log"
+)
+
+// HealActionServiceImpl implements HealActionService
+type HealActionServiceImpl struct {
+	dao HealActionDAO
+}
+
+// NewHealActionService creates a new heal action service
+func NewHealActionService(dao HealActionDAO) *HealActionServiceImpl {
+	return &HealActionServiceImpl{dao: dao}
+}
+
+// IdentifyFaultDomain identifies the fault domain from alert labels
+func (s *HealActionServiceImpl) IdentifyFaultDomain(labels map[string]string) FaultDomain {
+	service := labels["service_name"]
+	version := labels["version"]
+
+	if service != "" && version != "" {
+		return FaultDomainServiceVersion
+	}
+
+	// TODO: 可根据更多条件扩展其他故障域
+	// - 整体问题:检查是否有全局性指标异常
+	// - 单机房问题:检查是否有机房相关标签
+	// - 网络问题:检查是否有网络相关标签
+	return FaultDomainUnknown
+}
+
+// GetHealAction retrieves the appropriate heal action for a fault domain
+func (s *HealActionServiceImpl) GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) {
+	if faultDomain == FaultDomainUnknown {
+		return nil, fmt.Errorf("unknown fault domain, cannot determine heal action")
+	}
+
+	action, err := s.dao.GetByType(ctx, string(faultDomain))
+	if err != nil {
+		return nil, fmt.Errorf("failed to get heal action for domain %s: %w", faultDomain, err)
+	}
+
+	return action, nil
+}
+
+// ExecuteHealAction executes the heal action based on the rules
+func (s *HealActionServiceImpl) ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) {
+	if action == nil {
+		return &HealActionResult{
+			Success: false,
+			Error:   "no heal action provided",
+		}, nil
+	}
+
+	// Parse the rules
+	var rules HealActionRules
+	if err := 
json.Unmarshal(action.Rules, &rules); err != nil { + return &HealActionResult{ + Success: false, + Error: fmt.Sprintf("failed to parse heal action rules: %v", err), + }, nil + } + + // Execute based on action type + switch rules.Action { + case "rollback": + return s.executeRollback(ctx, rules, alertID, labels) + case "alert": + return s.executeAlert(rules, alertID, labels) + default: + return &HealActionResult{ + Success: false, + Error: fmt.Sprintf("unsupported action type: %s", rules.Action), + }, nil + } +} + +// executeRollback executes a rollback operation +func (s *HealActionServiceImpl) executeRollback(ctx context.Context, rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { + _ = ctx // TODO: Use context for HTTP timeout when calling real rollback API + // Check deployment status if specified + if rules.DeploymentStatus != "" { + // TODO: 实际实现中应该查询部署系统获取真实的部署状态 + // 这里暂时模拟检查 + deployStatus := s.getDeploymentStatus(labels) + if deployStatus != rules.DeploymentStatus { + return &HealActionResult{ + Success: false, + Message: fmt.Sprintf("deployment status mismatch: expected %s, got %s", rules.DeploymentStatus, deployStatus), + }, nil + } + } + + // Mock rollback execution + sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + log.Info(). + Str("alert_id", alertID). + Str("target", rules.Target). + Dur("sleep_duration", sleepDur). + Msg("executing mock rollback") + + // Simulate rollback time + time.Sleep(sleepDur) + + // TODO: 实际实现中应该调用真实的回滚接口 + // url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(labels)) + // 发起 HTTP POST 请求到回滚接口 + + return &HealActionResult{ + Success: true, + Message: fmt.Sprintf("rollback completed successfully, target: %s", rules.Target), + }, nil +} + +// executeAlert executes an alert-only action (no automatic healing) +func (s *HealActionServiceImpl) executeAlert(rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { + _ = labels // TODO: Use labels for context-specific alert messages + log.Warn(). + Str("alert_id", alertID). + Str("message", rules.Message). + Msg("heal action requires manual intervention") + + return &HealActionResult{ + Success: false, + Message: rules.Message, + }, nil +} + +// getDeploymentStatus gets the deployment status for the given labels +// TODO: 实际实现中应该查询部署系统获取真实的部署状态 +func (s *HealActionServiceImpl) getDeploymentStatus(labels map[string]string) string { + // 这里暂时返回模拟状态 + // 实际实现中应该: + // 1. 从 labels 中提取 service 和 version + // 2. 查询部署系统 API 获取当前部署状态 + // 3. 
返回 "deploying" 或 "deployed" + + service := labels["service_name"] + version := labels["version"] + + if service == "" || version == "" { + return "unknown" + } + + // 模拟逻辑:如果版本号包含 "dev" 或 "test",认为是发布中,待确认修改为实际的部署状态区分方式 + if version == "dev" || version == "test" { + return "deploying" + } + + return "deployed" +} + +// deriveDeployIDFromLabels derives deployment ID from labels +// TODO: Use this function when implementing real rollback API calls +func deriveDeployIDFromLabels(labels map[string]string) string { + if v := labels["deploy_id"]; v != "" { + return v + } + service := labels["service_name"] + version := labels["version"] + if service != "" && version != "" { + return fmt.Sprintf("%s:%s", service, version) + } + return "" +} diff --git a/internal/alerting/service/remediation/heal_action_service_test.go b/internal/alerting/service/remediation/heal_action_service_test.go new file mode 100644 index 0000000..eb45209 --- /dev/null +++ b/internal/alerting/service/remediation/heal_action_service_test.go @@ -0,0 +1,178 @@ +package remediation + +import ( + "context" + "encoding/json" + "testing" +) + +func TestHealActionServiceImpl_IdentifyFaultDomain(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + labels map[string]string + expected FaultDomain + }{ + { + name: "service_version_issue", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expected: FaultDomainServiceVersion, + }, + { + name: "missing_service_name", + labels: map[string]string{ + "version": "v1.0.0", + }, + expected: FaultDomainUnknown, + }, + { + name: "missing_version", + labels: map[string]string{ + "service_name": "test-service", + }, + expected: FaultDomainUnknown, + }, + { + name: "empty_labels", + labels: map[string]string{}, + expected: FaultDomainUnknown, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := service.IdentifyFaultDomain(tt.labels) + if result != tt.expected { + t.Errorf("IdentifyFaultDomain() = %v, want %v", result, tt.expected) + } + }) + } +} + +func TestHealActionServiceImpl_ExecuteHealAction(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + action *HealAction + alertID string + labels map[string]string + expectError bool + }{ + { + name: "rollback_action", + action: &HealAction{ + ID: "test-rollback", + Desc: "Test rollback action", + Type: "service_version_issue", + Rules: json.RawMessage(`{ + "deployment_status": "deploying", + "action": "rollback", + "target": "previous_version" + }`), + }, + alertID: "test-alert-1", + labels: map[string]string{ + "service_name": "test-service", + "version": "dev", + }, + expectError: false, + }, + { + name: "alert_action", + action: &HealAction{ + ID: "test-alert", + Desc: "Test alert action", + Type: "service_version_issue", + Rules: json.RawMessage(`{ + "action": "alert", + "message": "Version already deployed, manual intervention required" + }`), + }, + alertID: "test-alert-2", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expectError: false, + }, + { + name: "nil_action", + action: nil, + alertID: "test-alert-3", + labels: map[string]string{}, + expectError: false, // Should not error, but return failure result + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := service.ExecuteHealAction(context.Background(), tt.action, tt.alertID, tt.labels) + + if tt.expectError && err == nil { + 
t.Errorf("ExecuteHealAction() expected error but got none") + } + if !tt.expectError && err != nil { + t.Errorf("ExecuteHealAction() unexpected error: %v", err) + } + + if result == nil { + t.Errorf("ExecuteHealAction() returned nil result") + } + }) + } +} + +func TestHealActionServiceImpl_getDeploymentStatus(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + labels map[string]string + expected string + }{ + { + name: "deploying_version", + labels: map[string]string{ + "service_name": "test-service", + "version": "dev", + }, + expected: "deploying", + }, + { + name: "deployed_version", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expected: "deployed", + }, + { + name: "missing_service_name", + labels: map[string]string{ + "version": "v1.0.0", + }, + expected: "unknown", + }, + { + name: "missing_version", + labels: map[string]string{ + "service_name": "test-service", + }, + expected: "unknown", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := service.getDeploymentStatus(tt.labels) + if result != tt.expected { + t.Errorf("getDeploymentStatus() = %v, want %v", result, tt.expected) + } + }) + } +} diff --git a/internal/alerting/service/remediation/init_heal_actions.sql b/internal/alerting/service/remediation/init_heal_actions.sql new file mode 100644 index 0000000..b11fcee --- /dev/null +++ b/internal/alerting/service/remediation/init_heal_actions.sql @@ -0,0 +1,38 @@ +-- 创建 heal_actions 表 +CREATE TABLE IF NOT EXISTS heal_actions ( + id VARCHAR(255) PRIMARY KEY, + desc TEXT NOT NULL, + type VARCHAR(255) NOT NULL, + rules JSONB NOT NULL +); + +-- 创建索引 +CREATE INDEX IF NOT EXISTS idx_heal_actions_type ON heal_actions(type); + +-- 插入示例数据 +INSERT INTO heal_actions (id, desc, type, rules) VALUES +( + 'service_version_rollback_deploying', + '服务版本回滚方案(发布中版本)', + 'service_version_issue', + '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}' +), +( + 'service_version_alert_deployed', + '服务版本告警方案(已完成发布版本)', + 'service_version_issue', + '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚,需要人工介入处理"}' +), +( + 'service_version_rollback_default', + '服务版本回滚方案(默认)', + 'service_version_issue', + '{"action": "rollback", "target": "previous_version"}' +) +ON CONFLICT (id) DO UPDATE SET + desc = EXCLUDED.desc, + type = EXCLUDED.type, + rules = EXCLUDED.rules; + +-- 查询验证 +SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id; diff --git a/internal/alerting/service/remediation/observation_window.go b/internal/alerting/service/remediation/observation_window.go new file mode 100644 index 0000000..c07e146 --- /dev/null +++ b/internal/alerting/service/remediation/observation_window.go @@ -0,0 +1,169 @@ +package remediation + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/redis/go-redis/v9" + "github.com/rs/zerolog/log" +) + +// RedisObservationWindowManager implements ObservationWindowManager using Redis +type RedisObservationWindowManager struct { + redis *redis.Client +} + +// NewRedisObservationWindowManager creates a new Redis-based observation window manager +func NewRedisObservationWindowManager(redis *redis.Client) *RedisObservationWindowManager { + return &RedisObservationWindowManager{redis: redis} +} + +// StartObservation starts an observation window for a service +func (m *RedisObservationWindowManager) StartObservation(ctx context.Context, service, version, alertID string, 
duration time.Duration) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + now := time.Now() + window := &ObservationWindow{ + Duration: duration, + Service: service, + Version: version, + AlertID: alertID, + StartTime: now, + EndTime: now.Add(duration), + IsActive: true, + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + data, err := json.Marshal(window) + if err != nil { + return fmt.Errorf("failed to marshal observation window: %w", err) + } + + // Store with TTL equal to observation duration + buffer + ttl := duration + 5*time.Minute + err = m.redis.Set(ctx, key, data, ttl).Err() + if err != nil { + return fmt.Errorf("failed to store observation window: %w", err) + } + + log.Info(). + Str("service", service). + Str("version", version). + Str("alert_id", alertID). + Dur("duration", duration). + Time("end_time", window.EndTime). + Msg("started observation window") + + return nil +} + +// CheckObservation checks if there's an active observation window for a service +func (m *RedisObservationWindowManager) CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) { + if m.redis == nil { + return nil, fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + data, err := m.redis.Get(ctx, key).Result() + if err != nil { + if err == redis.Nil { + return nil, nil // No active observation window + } + return nil, fmt.Errorf("failed to get observation window: %w", err) + } + + var window ObservationWindow + err = json.Unmarshal([]byte(data), &window) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal observation window: %w", err) + } + + // Check if observation window has expired + if time.Now().After(window.EndTime) { + // Clean up expired window + m.redis.Del(ctx, key) + return nil, nil + } + + return &window, nil +} + +// CompleteObservation completes an observation window and marks it as successful +func (m *RedisObservationWindowManager) CompleteObservation(ctx context.Context, service, version string) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + + // Get the current window + window, err := m.CheckObservation(ctx, service, version) + if err != nil { + return fmt.Errorf("failed to check observation window: %w", err) + } + + if window == nil { + return fmt.Errorf("no active observation window found for service %s version %s", service, version) + } + + // Mark as completed and remove from Redis + window.IsActive = false + err = m.redis.Del(ctx, key).Err() + if err != nil { + return fmt.Errorf("failed to remove observation window: %w", err) + } + + log.Info(). + Str("service", service). + Str("version", version). + Str("alert_id", window.AlertID). + Dur("duration", window.Duration). 
+ Msg("completed observation window successfully") + + return nil +} + +// CancelObservation cancels an observation window due to new alerts +func (m *RedisObservationWindowManager) CancelObservation(ctx context.Context, service, version string) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + + // Get the current window for logging + window, err := m.CheckObservation(ctx, service, version) + if err != nil { + return fmt.Errorf("failed to check observation window: %w", err) + } + + if window == nil { + return nil // No active window to cancel + } + + // Remove the observation window + err = m.redis.Del(ctx, key).Err() + if err != nil { + return fmt.Errorf("failed to cancel observation window: %w", err) + } + + log.Warn(). + Str("service", service). + Str("version", version). + Str("alert_id", window.AlertID). + Msg("cancelled observation window due to new alerts") + + return nil +} + +// GetObservationDuration returns the configured observation duration +// TODO: 后续可以从配置或数据库中动态获取观察时间 +func GetObservationDuration() time.Duration { + // 暂时使用固定的30分钟观察窗口 + // 后续可以扩展为从环境变量或配置文件中读取 + return 30 * time.Minute +} diff --git a/internal/alerting/service/remediation/observation_window_test.go b/internal/alerting/service/remediation/observation_window_test.go new file mode 100644 index 0000000..8a2ec35 --- /dev/null +++ b/internal/alerting/service/remediation/observation_window_test.go @@ -0,0 +1,100 @@ +package remediation + +import ( + "context" + "testing" + "time" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRedisObservationWindowManager(t *testing.T) { + // 使用内存 Redis 客户端进行测试 + rdb := redis.NewClient(&redis.Options{ + Addr: "localhost:6379", // 需要 Redis 实例 + }) + defer rdb.Close() + + // 检查 Redis 连接 + ctx := context.Background() + if err := rdb.Ping(ctx).Err(); err != nil { + t.Skip("Redis not available, skipping test") + } + + manager := NewRedisObservationWindowManager(rdb) + + t.Run("StartObservation", func(t *testing.T) { + service := "test-service" + version := "v1.0.0" + alertID := "test-alert-1" + duration := 5 * time.Minute + + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 验证观察窗口已创建 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + require.NotNil(t, window) + assert.Equal(t, service, window.Service) + assert.Equal(t, version, window.Version) + assert.Equal(t, alertID, window.AlertID) + assert.True(t, window.IsActive) + }) + + t.Run("CheckObservation_NotFound", func(t *testing.T) { + service := "non-existent-service" + version := "v1.0.0" + + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) + + t.Run("CompleteObservation", func(t *testing.T) { + service := "test-service-2" + version := "v1.0.0" + alertID := "test-alert-2" + duration := 5 * time.Minute + + // 先创建观察窗口 + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 完成观察窗口 + err = manager.CompleteObservation(ctx, service, version) + require.NoError(t, err) + + // 验证观察窗口已被移除 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) + + t.Run("CancelObservation", func(t *testing.T) { + service := "test-service-3" + version := "v1.0.0" + alertID := "test-alert-3" + duration := 5 * 
time.Minute + + // 先创建观察窗口 + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 取消观察窗口 + err = manager.CancelObservation(ctx, service, version) + require.NoError(t, err) + + // 验证观察窗口已被移除 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) +} + +func TestGetObservationDuration(t *testing.T) { + duration := GetObservationDuration() + assert.Equal(t, 30*time.Minute, duration) +} diff --git a/internal/alerting/service/remediation/types.go b/internal/alerting/service/remediation/types.go new file mode 100644 index 0000000..c1c5f02 --- /dev/null +++ b/internal/alerting/service/remediation/types.go @@ -0,0 +1,74 @@ +package remediation + +import ( + "context" + "encoding/json" + "time" +) + +// HealAction represents a healing action configuration +type HealAction struct { + ID string `json:"id"` + Desc string `json:"desc"` + Type string `json:"type"` + Rules json.RawMessage `json:"rules"` +} + +// HealActionRules represents the rules for a heal action +type HealActionRules struct { + DeploymentStatus string `json:"deployment_status,omitempty"` + Action string `json:"action"` + Target string `json:"target,omitempty"` + Message string `json:"message,omitempty"` +} + +// FaultDomain represents the identified fault domain +type FaultDomain string + +const ( + FaultDomainServiceVersion FaultDomain = "service_version_issue" + FaultDomainUnknown FaultDomain = "unknown" +) + +// HealActionResult represents the result of executing a heal action +type HealActionResult struct { + Success bool `json:"success"` + Message string `json:"message,omitempty"` + Error string `json:"error,omitempty"` +} + +// ObservationWindow represents the observation period after healing +type ObservationWindow struct { + Duration time.Duration `json:"duration"` + Service string `json:"service"` + Version string `json:"version"` + AlertID string `json:"alert_id"` + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + IsActive bool `json:"is_active"` +} + +// ObservationWindowManager defines the interface for managing observation windows +type ObservationWindowManager interface { + StartObservation(ctx context.Context, service, version, alertID string, duration time.Duration) error + CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) + CompleteObservation(ctx context.Context, service, version string) error + CancelObservation(ctx context.Context, service, version string) error +} + +// HealActionDAO defines the interface for heal action database operations +type HealActionDAO interface { + GetByType(ctx context.Context, faultType string) (*HealAction, error) + GetByID(ctx context.Context, id string) (*HealAction, error) + Create(ctx context.Context, action *HealAction) error + Update(ctx context.Context, action *HealAction) error + Delete(ctx context.Context, id string) error + List(ctx context.Context) ([]*HealAction, error) +} + +// HealActionService defines the interface for heal action business logic +type HealActionService interface { + IdentifyFaultDomain(labels map[string]string) FaultDomain + GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) + ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) +} From b40dbd31ccce39b0e748566f1626752feb54b22f Mon Sep 17 00:00:00 2001 From: acd19ml Date: Mon, 22 Sep 2025 17:55:37 +0800 Subject: 
[PATCH 4/4] Revert "Merge branch 'feature/alert-healing-with-observation-window' into develop"

This reverts commit bc519391b3de8af08641d9bc30f4e69b6d2bbf8d, reversing
changes made to 7ae82d12553ba2c02828e3b44b836cb6f629b701.
---
 docs/alerting/database-design.md              |  47 +--
 go.mod                                        |   3 -
 internal/alerting/database/database.go        |   5 -
 .../alerting/service/remediation/README.md    | 373 +++---
 .../alerting/service/remediation/consumer.go  | 258 ++----
 .../service/remediation/heal_action_dao.go    | 145 -------
 .../remediation/heal_action_service.go        | 172 --------
 .../remediation/heal_action_service_test.go   | 178 ---------
 .../service/remediation/init_heal_actions.sql |  38 --
 .../service/remediation/observation_window.go | 169 --------
 .../remediation/observation_window_test.go    | 100 -----
 .../alerting/service/remediation/types.go     |  74 ----
 12 files changed, 70 insertions(+), 1492 deletions(-)
 delete mode 100644 internal/alerting/service/remediation/heal_action_dao.go
 delete mode 100644 internal/alerting/service/remediation/heal_action_service.go
 delete mode 100644 internal/alerting/service/remediation/heal_action_service_test.go
 delete mode 100644 internal/alerting/service/remediation/init_heal_actions.sql
 delete mode 100644 internal/alerting/service/remediation/observation_window.go
 delete mode 100644 internal/alerting/service/remediation/observation_window_test.go
 delete mode 100644 internal/alerting/service/remediation/types.go

diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md
index 0f860df..b119349 100644
--- a/docs/alerting/database-design.md
+++ b/docs/alerting/database-design.md
@@ -2,7 +2,7 @@
 
 ## 概述
 
-本文档为最新数据库设计,总计包含 7 张表:
+本文档为最新数据库设计,总计包含 6 张表:
 
 - alert_issues
 - alert_issue_comments
@@ -10,7 +10,6 @@
 - alert_rules
 - alert_rule_metas
 - service_states
-- heal_actions
 
 ## 数据表设计
 
@@ -112,7 +111,7 @@
 
 ---
 
-### 6) service_states(服务状态表)
+### 7) service_states(服务状态表)
 
 追踪服务在某一版本上的健康状态与处置进度。
 
@@ -128,34 +127,6 @@
 **索引建议:**
 - PRIMARY KEY: `(service, version)`
 
----
-
-### 7) heal_actions(告警治愈解决方案表)
-
-存储不同故障域对应的治愈方案和规则。
-
-| 字段名 | 类型 | 说明 |
-|--------|------|------|
-| id | varchar(255) PK | 治愈方案 ID |
-| desc | text | 简单描述,如 action 是处理什么告警场景的 |
-| type | varchar(255) | 对应的故障域类型 |
-| rules | jsonb | 条件规则:{condition1: action1, condition2: action2} |
-
-**索引建议:**
-- PRIMARY KEY: `id`
-- INDEX: `(type)`
-
-**示例数据:**
-```sql
-INSERT INTO heal_actions (id, "desc", type, rules) VALUES
-('service_version_rollback', '服务版本回滚方案', 'service_version_issue',
- '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}'),
-('service_version_alert', '服务版本告警方案', 'service_version_issue',
- '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚"}');
-```
-
-TODO: health_state映射逻辑
-
 ## 数据关系(ER)
 
 ```mermaid
@@ -204,25 +175,13 @@ erDiagram
     text content
   }
 
-  heal_actions {
-    varchar id PK
-    text desc
-    varchar type
-    jsonb rules
-  }
-
   %% 通过 service 等标签在应用层逻辑关联
   alert_rule_metas ||..|| alert_rules : "by alert_name"
   service_states ||..|| alert_rule_metas : "by service/version labels"
-  heal_actions ||..|| alert_issues : "by fault domain analysis"
 ```
 
 ## 数据流转
 
 1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。
 2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。
-3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。
-4. 
**告警治愈流程**: - - P0 告警:根据 `alert_issues.labels` 识别故障域,查询 `heal_actions` 获取治愈方案 - - 执行治愈操作(如回滚),成功后更新 `alert_issues` 和 `service_states` 状态 - - P1/P2 告警:直接进入下钻分析,记录分析结果到 `alert_issue_comments` \ No newline at end of file +3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file diff --git a/go.mod b/go.mod index 8cf046f..6094f9c 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 - github.com/stretchr/testify v1.11.1 ) require ( @@ -17,7 +16,6 @@ require ( github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect github.com/gin-contrib/cors v1.7.6 // indirect @@ -41,7 +39,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect - github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect golang.org/x/arch v0.18.0 // indirect diff --git a/internal/alerting/database/database.go b/internal/alerting/database/database.go index 6b2ab9b..e6ee504 100644 --- a/internal/alerting/database/database.go +++ b/internal/alerting/database/database.go @@ -38,8 +38,3 @@ func (d *Database) ExecContext(ctx context.Context, q string, args ...any) (sql. func (d *Database) QueryContext(ctx context.Context, q string, args ...any) (*sql.Rows, error) { return d.db.QueryContext(ctx, q, args...) } - -// QueryRowContext exposes database/sql QueryRowContext for single row SELECT queries. -func (d *Database) QueryRowContext(ctx context.Context, q string, args ...any) *sql.Row { - return d.db.QueryRowContext(ctx, q, args...) -} diff --git a/internal/alerting/service/remediation/README.md b/internal/alerting/service/remediation/README.md index ca41e49..de5ca08 100644 --- a/internal/alerting/service/remediation/README.md +++ b/internal/alerting/service/remediation/README.md @@ -1,28 +1,23 @@ -# remediation — 告警治愈与下钻分析 +# remediation — 通道消费与自动回滚(Mock) -本包实现一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,根据告警等级进行分流处理: -- **P0 告警**:进入"故障治愈"模块,执行自动修复操作 -- **P1/P2 告警**:进入"下钻分析"模块,进行深度分析 +本包规划一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,模拟执行“自动回滚”,回滚成功后将相关告警与服务态标记为恢复。 —— ## 1. 
目标 - 订阅 `healthcheck` 的 `AlertMessage`(进程内 channel) -- 根据 `level` 字段进行分流: - - **P0 告警**:故障治愈流程 - 1) 确认故障域(从 labels 分析 service_name + version) - 2) 查询 `heal_actions` 表获取治愈方案 - 3) 执行治愈操作(当前仅支持回滚) - 4) 治愈成功后启动观察窗口(默认30分钟) - 5) 观察窗口内如果出现新告警,取消观察并重新处理 - 6) 观察窗口完成后,更新服务状态为正常 - - **P1/P2 告警**:直接进入下钻分析流程 - 1) 执行 AI 分析 - 2) 更新告警状态为恢复 - 3) 记录分析结果到评论 - -> 说明:本阶段实现故障域识别和治愈方案查询,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 +- 对每条消息: + 1) Mock 调用回滚接口 `POST /v1/deployments/:deployID/rollback` + 2) `sleep 30s` 后返回“回滚成功”的模拟响应 + 3) 若成功,则更新 DB 与缓存: + - `alert_issues.alert_state = 'Restored'` + - `alert_issues.state = 'Closed'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()`(当前时间) + - 同时在 `alert_issue_comments` 中追加一条 AI 分析评论(见下文内容模板) + +> 说明:本阶段仅实现消费与 Mock,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 —— @@ -42,10 +37,8 @@ } ``` -- 故障域识别:从 `Labels` 中提取 `service_name` 和 `version` 信息 - deployID 的来源(用于构造回滚 URL): - - 可从 `Labels["deploy_id"]`(若存在)读取 - - 若为空,可按 `{service}:{version}` 组装一个占位 ID + - Mock 阶段:可从 `Labels["deploy_id"]`(若存在)读取;若为空,可按 `{service}:{version}` 组装一个占位 ID。 —— @@ -69,7 +62,7 @@ REMEDIATION_ROLLBACK_SLEEP=30s —— -## 4. 处理流程(伪代码) +## 4. 流程(伪代码) ```go func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rdb *redis.Client) { @@ -78,256 +71,30 @@ func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rd case <-ctx.Done(): return case m := <-ch: - switch m.Level { - case "P0": - // P0 告警:故障治愈流程 - handleP0Alert(ctx, m, db, rdb) - case "P1", "P2": - // P1/P2 告警:下钻分析流程 - handleP1P2Alert(ctx, m, db, rdb) - default: - log.Printf("Unknown alert level: %s", m.Level) + // 1) 组装回滚 URL(Mock) + deployID := m.Labels["deploy_id"] + if deployID == "" { + // 仅 Mock:用 service:version 兜底 + deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) } - } - } -} - -// P0 告警处理:故障治愈流程 -func handleP0Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 确认故障域 - faultDomain := identifyFaultDomain(m.Labels) - - // 2) 查询治愈方案 - healAction, err := queryHealAction(ctx, db, faultDomain) - if err != nil { - log.Printf("Failed to query heal action: %v", err) - // 治愈方案查询失败,直接进入下钻分析 - handleDrillDownAnalysis(ctx, m, db, rdb) - return - } - - // 3) 执行治愈操作 - success := executeHealAction(ctx, healAction, m) - if !success { - log.Printf("Heal action failed for alert %s", m.ID) - // 治愈操作失败,直接进入下钻分析 - handleDrillDownAnalysis(ctx, m, db, rdb) - return - } - - // 4) 治愈成功后启动观察窗口,延迟状态更新 - handleDrillDownAnalysisWithObservation(ctx, m, db, rdb) -} - -// P1/P2 告警处理:下钻分析流程 -func handleP1P2Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - handleDrillDownAnalysis(ctx, m, db, rdb) -} + url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deployID) -// 故障域识别 -func identifyFaultDomain(labels map[string]string) string { - service := labels["service_name"] - version := labels["version"] - - if service != "" && version != "" { - return "service_version_issue" - } - - // 可根据更多条件扩展其他故障域 - return "unknown" -} + // 2) 发起回滚(Mock):sleep 指定时间再判为成功 + sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + // TODO: 如需真实 HTTP 调用,可在此发起 POST 并根据响应判断 -// 查询治愈方案 -func queryHealAction(ctx context.Context, db *Database, faultDomain string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` - // 实现查询逻辑 - return nil, nil -} - -// 执行治愈操作 -func executeHealAction(ctx context.Context, action *HealAction, m AlertMessage) bool { - // 根据 action.rules 中的条件执行相应操作 
- // 当前仅支持回滚操作 - if action.Rules["action"] == "rollback" { - return executeRollback(ctx, m) - } else if action.Rules["action"] == "alert" { - log.Printf("Alert: %s", action.Rules["message"]) - return false - } - return false -} - -// 执行回滚操作 -func executeRollback(ctx context.Context, m AlertMessage) bool { - deployID := m.Labels["deploy_id"] - if deployID == "" { - deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + // 3) 成功后,先写入 AI 分析评论,再更新 DB 与缓存状态 + _ = addAIAnalysisComment(ctx, db, m) + _ = markRestoredInDB(ctx, db, m) + _ = markRestoredInCache(ctx, rdb, m) + } } - - // Mock 回滚:sleep 指定时间 - sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - // TODO: 真实 HTTP 调用回滚接口 - - return true -} - -// 下钻分析处理(P1/P2 告警直接使用) -func handleDrillDownAnalysis(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 执行 AI 分析 - _ = addAIAnalysisComment(ctx, db, m) - - // 2) 更新告警状态为恢复 - _ = markRestoredInDB(ctx, db, m) - - // 3) 更新缓存状态 - _ = markRestoredInCache(ctx, rdb, m) -} - -// 下钻分析处理(P0 告警治愈后使用,延迟状态更新) -func handleDrillDownAnalysisWithObservation(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 执行 AI 分析 - _ = addAIAnalysisComment(ctx, db, m) - - // 2) 记录治愈完成评论,但不更新告警状态 - _ = addHealingCompletedComment(ctx, db, m) - - // 3) 启动观察窗口,等待30分钟 - _ = startObservationWindow(ctx, m.Service, m.Version, m.ID, 30*time.Minute) - - // 注意:此时不更新 alert_issues.alert_state 和 service_states.health_state - // 状态更新将在观察窗口完成后进行 -} - -// 观察窗口完成后的处理 -func completeObservationWindow(ctx context.Context, service, version string, db *Database, rdb *redis.Client) { - // 1) 完成观察窗口 - _ = completeObservation(ctx, service, version) - - // 2) 更新 alert_issues.alert_state 为 'Restored' - // 3) 更新 service_states.health_state 为 'Normal' - // 4) 更新相关缓存 - _ = markServiceAsNormal(ctx, service, version, db, rdb) - - log.Printf("Observation window completed for service %s version %s, status updated to Normal", service, version) } ``` —— -## 5. 故障域识别与治愈方案 - -### 故障域类型 - -当前支持的故障域类型: - -1. **service_version_issue**:服务版本问题 - - 识别条件:`labels["service_name"]` 和 `labels["version"]` 都存在 - - 治愈方案: - - 发布中版本:执行回滚操作 - - 已完成发布版本:提示暂不支持自动回滚 - -2. **unknown**:未知故障域 - - 识别条件:无法从标签中识别出已知故障域 - - 处理方式:跳过治愈,直接进入下钻分析 - -### 治愈方案规则 - -`heal_actions.rules` 字段的 JSON 结构: - -```json -{ - "deployment_status": "deploying|deployed", - "action": "rollback|alert", - "target": "previous_version", - "message": "版本已发布,暂不支持自动回滚" -} -``` - -### 治愈操作类型 - -1. **rollback**:执行回滚操作 - - 调用部署系统的回滚接口 - - 回滚到上一个稳定版本 - -2. **alert**:仅告警,不执行自动操作 - - 记录告警信息 - - 需要人工介入处理 - -### 扩展性设计 - -- 故障域类型可扩展:整体问题、单机房问题、网络问题等 -- 治愈方案可扩展:重启服务、扩容、切换流量等 -- 规则条件可扩展:基于更多标签和指标进行判断 - -#### 添加新的故障域类型 - -1. 在 `types.go` 中添加新的 `FaultDomain` 常量 -2. 在 `IdentifyFaultDomain` 方法中添加识别逻辑 -3. 在数据库中配置对应的治愈方案 - -#### 添加新的治愈操作类型 - -1. 在 `HealActionRules` 结构体中添加新字段 -2. 在 `ExecuteHealAction` 方法中添加新的 case 分支 -3. 实现具体的治愈操作逻辑 - -### 观察窗口机制 - -观察窗口是治愈操作完成后的验证期,用于确保治愈操作的有效性: - -1. **启动条件**:P0 告警治愈操作成功完成后自动启动 -2. **持续时间**:默认30分钟,可配置 -3. **监控内容**:观察该服务是否在窗口期内出现新的告警 -4. **处理逻辑**: - - 如果窗口期内出现新告警:取消观察窗口,重新进入治愈流程 - - 如果窗口期内无新告警:完成观察窗口,更新服务状态为正常 -5. **状态更新时机**: - - **治愈操作完成后**:不立即更新状态,只记录治愈完成评论 - - **观察窗口完成后**:同时更新 `alert_issues.alert_state` 为 `Restored` 和 `service_states.health_state` 为 `Normal` -6. **关键原则**:每次修改 `service_states.health_state` 为 `Normal` 时,都必须同时修改 `alert_issues.alert_state` 为 `Restored` - -—— - -## 6. 
代码使用示例 - -### 数据库初始化 - -```bash -# 执行初始化脚本 -psql -U postgres -d zeroops -f init_heal_actions.sql -``` - -### 代码使用 - -```go -// 创建服务 -healDAO := NewPgHealActionDAO(db) -healService := NewHealActionService(healDAO) - -// 识别故障域 -faultDomain := healService.IdentifyFaultDomain(labels) - -// 获取治愈方案 -healAction, err := healService.GetHealAction(ctx, faultDomain) - -// 执行治愈操作 -result, err := healService.ExecuteHealAction(ctx, healAction, alertID, labels) -``` - -### 测试 - -运行测试: - -```bash -go test ./internal/alerting/service/remediation -v -``` - -测试覆盖: -- 故障域识别逻辑 -- 治愈操作执行 -- 部署状态判断 - -## 7. DB 更新(SQL 建议) +## 5. DB 更新(SQL 建议) - 告警状态: ```sql @@ -354,7 +121,7 @@ VALUES ( ); ``` -评论内容模板(Markdown,多行,内容暂未设计): +评论内容模板(Markdown,多行): ``` ## AI分析结果 **问题类型**:非发版本导致的问题 @@ -370,7 +137,7 @@ VALUES ( —— -## 8. 缓存更新(Redis,Lua CAS 建议) +## 6. 缓存更新(Redis,Lua CAS 建议) - 告警缓存 `alert:issue:{id}`: ```lua @@ -411,80 +178,30 @@ return 1 —— -## 9. 幂等与重试 +## 7. 幂等与重试 - 幂等:同一 `AlertMessage.ID` 的回滚处理应具备幂等性,重复消费不应产生额外副作用。 - 重试:Mock 模式下可忽略;接入真实接口后,对 5xx/网络错误考虑重试与退避,最终写入失败应有告警与补偿。 —— -## 10. 验证步骤(与 healthcheck E2E 相衔接) - -### 基础验证步骤 +## 8. 验证步骤(与 healthcheck E2E 相衔接) 1) 启动 Redis/Postgres 与 API(参考 `healthcheck/E2E_VALIDATION.md` 与 `env_example.txt`) -2) 创建 `heal_actions` 表并插入测试数据 -3) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)` - -### P0 告警验证(故障治愈流程) - -4) 触发 P0 级别 Webhook,`alert_issues` 入库为 `Pending` -5) 等待 `healthcheck` 将缓存态切到 `InProcessing` -6) 验证故障域识别:检查日志中是否正确识别为 `service_version_issue` -7) 验证治愈方案查询:检查是否从 `heal_actions` 表查询到对应方案 -8) 等待 `remediation` 执行治愈操作完成: - - 验证观察窗口已启动(Redis 中存在观察窗口记录) - - `alert_issue_comments` 中新增治愈完成评论 - - **重要**:验证 `alert_issues.alert_state` 仍为 `InProcessing`(未更新为 `Restored`) - - **重要**:验证 `service_states.health_state` 未更新为 `Normal` -9) 等待观察窗口完成(30分钟后)或模拟窗口期内新告警: - - **如果无新告警**: - - 验证观察窗口自动完成 - - 验证状态同时更新为 `alert_issues.alert_state = 'Restored'` 和 `service_states.health_state = 'Normal'` - - **如果有新告警**: - - 验证观察窗口被取消 - - 验证重新进入治愈流程 - - 验证状态未更新为 `Restored`/`Normal` - -### P1/P2 告警验证(下钻分析流程) - -9) 触发 P1 或 P2 级别 Webhook -10) 验证直接进入下钻分析流程,跳过故障治愈步骤 -11) 验证 AI 分析评论生成和状态更新 - -### 最终验证 - -12) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新 -13) 验证不同告警等级的处理路径正确性 +2) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)` +3) `curl` 触发 Webhook,`alert_issues` 入库为 `Pending` +4) 等待 `healthcheck` 将缓存态切到 `InProcessing` +5) 等待 `remediation` mock 回滚完成 → DB 与缓存更新: + - `alert_issues.alert_state = 'Restored'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()` +6) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新(comments 仍为 mock) —— -## 11. 注意事项 - -1. **service_states 表逻辑**: 当前版本中,`service_states` 表的更新逻辑暂时不实现,但保留了扩展空间 -2. **Mock 模式**: 当前回滚操作为 Mock 模式,实际部署时需要接入真实的部署系统 API -3. **错误处理**: 治愈操作失败时会记录日志并继续进入下钻分析流程 -4. **幂等性**: 同一告警的重复处理应该具备幂等性 - -## 12. 后续计划 - -### 短期计划 +## 9. 
后续计划 -- 实现 `heal_actions` 表的完整 CRUD 操作 -- 完善故障域识别逻辑,支持更多故障类型 - 接入真实部署系统回滚接口与鉴权 -- 实现治愈方案的动态配置和管理界面 - -### 中期计划 - -- 扩展治愈操作类型:服务重启、扩容、流量切换等 -- 增加治愈方案的执行结果反馈和效果评估 - 将进程内 channel 平滑切换为 MQ(Kafka/NATS) -- 完善指标与可观测:事件消费速率、成功率、时延分位、治愈结果等 - -### 长期计划 - -- 基于历史数据训练 AI 模型,自动推荐最优治愈方案 -- 增加补偿任务:对"治愈成功但缓存/DB 未一致"的场景进行对账修复 -- 实现治愈方案的 A/B 测试和效果对比 -- 构建完整的故障自愈知识库和最佳实践库 +- 完善指标与可观测:事件消费速率、成功率、时延分位、回滚结果等 +- 增加补偿任务:对“回滚成功但缓存/DB 未一致”的场景进行对账修复 diff --git a/internal/alerting/service/remediation/consumer.go b/internal/alerting/service/remediation/consumer.go index cead8d1..07cae6b 100644 --- a/internal/alerting/service/remediation/consumer.go +++ b/internal/alerting/service/remediation/consumer.go @@ -3,6 +3,7 @@ package remediation import ( "context" "fmt" + "os" "strconv" "time" @@ -16,189 +17,46 @@ type Consumer struct { DB *adb.Database Redis *redis.Client - // Heal action service for P0 alerts - healService HealActionService - - // Observation window manager - obsManager ObservationWindowManager - // sleepFn allows overriding for tests sleepFn func(time.Duration) } func NewConsumer(db *adb.Database, rdb *redis.Client) *Consumer { - healDAO := NewPgHealActionDAO(db) - healService := NewHealActionService(healDAO) - obsManager := NewRedisObservationWindowManager(rdb) - return &Consumer{ - DB: db, - Redis: rdb, - healService: healService, - obsManager: obsManager, - sleepFn: time.Sleep, - } + return &Consumer{DB: db, Redis: rdb, sleepFn: time.Sleep} } -// Start consumes alert messages and processes them based on alert level +// Start consumes alert messages and performs a mocked rollback then marks restored. func (c *Consumer) Start(ctx context.Context, ch <-chan healthcheck.AlertMessage) { if ch == nil { log.Warn().Msg("remediation consumer started without channel; no-op") return } - + sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) for { select { case <-ctx.Done(): return case m := <-ch: - // 首先检查是否有观察窗口需要处理 - c.handleObservationWindow(ctx, &m) - - switch m.Level { - case "P0": - // P0 告警:故障治愈流程 - c.handleP0Alert(ctx, &m) - case "P1", "P2": - // P1/P2 告警:下钻分析流程 - c.handleP1P2Alert(ctx, &m) - default: - log.Warn().Str("level", m.Level).Str("issue", m.ID).Msg("unknown alert level, skipping") + // 1) Mock rollback: optional URL composition (unused) + _ = fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(&m)) + // 2) Sleep to simulate rollback time + if c.sleepFn != nil { + c.sleepFn(sleepDur) + } + // 3) On success: add AI analysis comment, update DB and cache + if err := c.addAIAnalysisComment(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + if err := c.markRestoredInDB(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") + } + if err := c.markRestoredInCache(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") } } } } -// handleObservationWindow handles observation window logic for incoming alerts -func (c *Consumer) handleObservationWindow(ctx context.Context, m *healthcheck.AlertMessage) { - if m.Service == "" { - return // No service information, skip observation window check - } - - // 检查是否有该服务的观察窗口 - window, err := c.obsManager.CheckObservation(ctx, m.Service, m.Version) - if err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to check observation window") - return - } - - if window == nil { - return // No active observation window - } - - // 如果在观察窗口期间出现新的告警,取消观察窗口 
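	// Cancellation is best-effort: the alert that triggered it still falls
	// through to the level switch in Start, so the service re-enters the
	// healing path instead of silently completing the window.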
- log.Warn(). - Str("service", m.Service). - Str("version", m.Version). - Str("alert_id", m.ID). - Str("observation_alert_id", window.AlertID). - Msg("new alert detected during observation window, cancelling observation") - - if err := c.obsManager.CancelObservation(ctx, m.Service, m.Version); err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to cancel observation window") - } -} - -// handleP0Alert handles P0 alerts with fault healing process -func (c *Consumer) handleP0Alert(ctx context.Context, m *healthcheck.AlertMessage) { - log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P0 alert with fault healing") - - // 1) 确认故障域 - faultDomain := c.healService.IdentifyFaultDomain(m.Labels) - log.Info().Str("issue", m.ID).Str("fault_domain", string(faultDomain)).Msg("identified fault domain") - - // 2) 查询治愈方案 - healAction, err := c.healService.GetHealAction(ctx, faultDomain) - if err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("failed to get heal action") - // 如果无法获取治愈方案,直接进入下钻分析 - c.handleDrillDownAnalysis(ctx, m) - return - } - - // 3) 执行治愈操作 - result, err := c.healService.ExecuteHealAction(ctx, healAction, m.ID, m.Labels) - if err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("failed to execute heal action") - c.handleDrillDownAnalysis(ctx, m) - return - } - - if !result.Success { - log.Warn().Str("issue", m.ID).Str("message", result.Message).Msg("heal action failed") - // 治愈失败,仍然进入下钻分析 - c.handleDrillDownAnalysis(ctx, m) - return - } - - log.Info().Str("issue", m.ID).Str("message", result.Message).Msg("heal action completed successfully") - - // 4) 治愈成功后启动观察窗口 - if m.Service != "" { - obsDuration := GetObservationDuration() - if err := c.obsManager.StartObservation(ctx, m.Service, m.Version, m.ID, obsDuration); err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to start observation window") - } else { - log.Info(). - Str("service", m.Service). - Str("version", m.Version). - Str("alert_id", m.ID). - Dur("duration", obsDuration). 
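				// obsDuration comes from GetObservationDuration(), which is a
				// fixed 30-minute default (see observation_window.go).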
- Msg("started observation window after successful healing") - } - } - - // 5) 治愈成功后进入下钻分析(但不立即更新状态) - c.handleDrillDownAnalysisWithObservation(ctx, m) -} - -// handleP1P2Alert handles P1/P2 alerts with drill-down analysis -func (c *Consumer) handleP1P2Alert(ctx context.Context, m *healthcheck.AlertMessage) { - log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P1/P2 alert with drill-down analysis") - - // 直接进入下钻分析流程 - c.handleDrillDownAnalysis(ctx, m) -} - -// handleDrillDownAnalysis performs drill-down analysis and marks alert as restored -func (c *Consumer) handleDrillDownAnalysis(ctx context.Context, m *healthcheck.AlertMessage) { - // 1) 执行 AI 分析 - if err := c.addAIAnalysisComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - - // 2) 更新告警状态为恢复 - if err := c.markRestoredInDB(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") - } - - // 3) 更新缓存状态 - if err := c.markRestoredInCache(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") - } -} - -// handleDrillDownAnalysisWithObservation performs drill-down analysis but delays status update for observation -func (c *Consumer) handleDrillDownAnalysisWithObservation(ctx context.Context, m *healthcheck.AlertMessage) { - // 1) 执行 AI 分析 - if err := c.addAIAnalysisComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - - // 2) 暂时不更新告警状态,等待观察窗口完成 - // 只记录治愈操作完成的评论 - if err := c.addHealingCompletedComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addHealingCompletedComment failed") - } - - log.Info(). - Str("issue", m.ID). - Str("service", m.Service). - Str("version", m.Version). 
- Msg("healing completed, waiting for observation window to complete before updating status") -} - -// deriveDeployID derives deployment ID from alert message -// TODO: Use this function when implementing real rollback API calls func deriveDeployID(m *healthcheck.AlertMessage) string { if m == nil { return "" @@ -233,38 +91,15 @@ func (c *Consumer) addAIAnalysisComment(ctx context.Context, m *healthcheck.Aler return err } -func (c *Consumer) addHealingCompletedComment(ctx context.Context, m *healthcheck.AlertMessage) error { - if c.DB == nil || m == nil { - return nil - } - const existsQ = `SELECT 1 FROM alert_issue_comments WHERE issue_id=$1 AND content=$2 LIMIT 1` - const insertQ = `INSERT INTO alert_issue_comments (issue_id, create_at, content) VALUES ($1, NOW(), $2)` - content := "## 治愈操作完成\n" + - "**操作状态**:治愈操作已成功执行\n" + - "**观察窗口**:正在等待观察窗口完成(30分钟)\n" + - "**下一步**:如果观察窗口内无新告警,将自动更新服务状态为正常" - if rows, err := c.DB.QueryContext(ctx, existsQ, m.ID, content); err == nil { - defer rows.Close() - if rows.Next() { - return nil - } - } - _, err := c.DB.ExecContext(ctx, insertQ, m.ID, content) - return err -} - func (c *Consumer) markRestoredInDB(ctx context.Context, m *healthcheck.AlertMessage) error { if c.DB == nil || m == nil { return nil } - - // 更新 alert_issues 状态 + // alert_issues if _, err := c.DB.ExecContext(ctx, `UPDATE alert_issues SET alert_state = 'Restored' , state = 'Closed' WHERE id = $1`, m.ID); err != nil { return err } - - // 同时更新 service_states.health_state 为 Normal - // 注意:每次修改 service_states 为 Normal 时都需要修改 alert_issues.alert_state 为 Restored + // service_states (upsert) if m.Service != "" { const upsert = ` INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) @@ -277,7 +112,6 @@ SET health_state = 'Normal', return err } } - return nil } @@ -312,7 +146,7 @@ return 1 `) _, _ = script.Run(ctx, c.Redis, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing", "alert:index:alert_state:Restored", "alert:index:open", "alert:index:closed"}, "Restored", m.ID, "Closed").Result() - // 更新 service_state 缓存 + // 2) service_state:{service}:{version} → health_state=Normal; resolved_at=now; add to Normal index if m.Service != "" { svcKey := "service_state:" + m.Service + ":" + m.Version now := time.Now().UTC().Format(time.RFC3339Nano) @@ -331,54 +165,6 @@ return 1 return nil } -// CompleteObservationAndUpdateStatus completes observation window and updates service status -func (c *Consumer) CompleteObservationAndUpdateStatus(ctx context.Context, service, version string) error { - if service == "" { - return fmt.Errorf("service name is required") - } - - // 完成观察窗口 - if err := c.obsManager.CompleteObservation(ctx, service, version); err != nil { - return fmt.Errorf("failed to complete observation window: %w", err) - } - - // 更新服务状态为正常 - const upsert = ` -INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) -VALUES ($1, $2, NULL, NOW(), 'Normal', ARRAY[]::text[]) -ON CONFLICT (service, version) DO UPDATE -SET health_state = 'Normal', - resolved_at = NOW(); -` - if _, err := c.DB.ExecContext(ctx, upsert, service, version); err != nil { - return fmt.Errorf("failed to update service state: %w", err) - } - - // 更新缓存 - if c.Redis != nil { - svcKey := "service_state:" + service + ":" + version - now := time.Now().UTC().Format(time.RFC3339Nano) - svcScript := redis.NewScript(` -local v = redis.call('GET', KEYS[1]) -if not v then v = '{}' end -local obj = 
cjson.decode(v) -obj.health_state = ARGV[1] -obj.resolved_at = ARGV[2] -redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') -if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end -return 1 -`) - _, _ = svcScript.Run(ctx, c.Redis, []string{svcKey, "service_state:index:health:Normal"}, "Normal", now).Result() - } - - log.Info(). - Str("service", service). - Str("version", version). - Msg("observation window completed successfully, service status updated to Normal") - - return nil -} - func parseDuration(s string, d time.Duration) time.Duration { if s == "" { return d diff --git a/internal/alerting/service/remediation/heal_action_dao.go b/internal/alerting/service/remediation/heal_action_dao.go deleted file mode 100644 index af71b5e..0000000 --- a/internal/alerting/service/remediation/heal_action_dao.go +++ /dev/null @@ -1,145 +0,0 @@ -package remediation - -import ( - "context" - "database/sql" - "encoding/json" - "fmt" - - adb "github.com/qiniu/zeroops/internal/alerting/database" -) - -// PgHealActionDAO implements HealActionDAO using PostgreSQL -type PgHealActionDAO struct { - DB *adb.Database -} - -// NewPgHealActionDAO creates a new PostgreSQL heal action DAO -func NewPgHealActionDAO(db *adb.Database) *PgHealActionDAO { - return &PgHealActionDAO{DB: db} -} - -// GetByType retrieves a heal action by fault domain type -func (d *PgHealActionDAO) GetByType(ctx context.Context, faultType string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` - - row := d.DB.QueryRowContext(ctx, q, faultType) - var action HealAction - var rulesJSON string - - err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - if err == sql.ErrNoRows { - return nil, fmt.Errorf("no heal action found for type: %s", faultType) - } - return nil, fmt.Errorf("failed to get heal action by type: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - return &action, nil -} - -// GetByID retrieves a heal action by ID -func (d *PgHealActionDAO) GetByID(ctx context.Context, id string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE id = $1` - - row := d.DB.QueryRowContext(ctx, q, id) - var action HealAction - var rulesJSON string - - err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - if err == sql.ErrNoRows { - return nil, fmt.Errorf("no heal action found with id: %s", id) - } - return nil, fmt.Errorf("failed to get heal action by id: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - return &action, nil -} - -// Create creates a new heal action -func (d *PgHealActionDAO) Create(ctx context.Context, action *HealAction) error { - const q = `INSERT INTO heal_actions (id, desc, type, rules) VALUES ($1, $2, $3, $4)` - - _, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules)) - if err != nil { - return fmt.Errorf("failed to create heal action: %w", err) - } - - return nil -} - -// Update updates an existing heal action -func (d *PgHealActionDAO) Update(ctx context.Context, action *HealAction) error { - const q = `UPDATE heal_actions SET desc = $2, type = $3, rules = $4 WHERE id = $1` - - result, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules)) - if err != nil { - return fmt.Errorf("failed to update heal action: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if 
rowsAffected == 0 { - return fmt.Errorf("no heal action found with id: %s", action.ID) - } - - return nil -} - -// Delete deletes a heal action by ID -func (d *PgHealActionDAO) Delete(ctx context.Context, id string) error { - const q = `DELETE FROM heal_actions WHERE id = $1` - - result, err := d.DB.ExecContext(ctx, q, id) - if err != nil { - return fmt.Errorf("failed to delete heal action: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if rowsAffected == 0 { - return fmt.Errorf("no heal action found with id: %s", id) - } - - return nil -} - -// List retrieves all heal actions -func (d *PgHealActionDAO) List(ctx context.Context) ([]*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id` - - rows, err := d.DB.QueryContext(ctx, q) - if err != nil { - return nil, fmt.Errorf("failed to list heal actions: %w", err) - } - defer rows.Close() - - var actions []*HealAction - for rows.Next() { - var action HealAction - var rulesJSON string - - err := rows.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - return nil, fmt.Errorf("failed to scan heal action: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - actions = append(actions, &action) - } - - if err = rows.Err(); err != nil { - return nil, fmt.Errorf("error iterating heal actions: %w", err) - } - - return actions, nil -} diff --git a/internal/alerting/service/remediation/heal_action_service.go b/internal/alerting/service/remediation/heal_action_service.go deleted file mode 100644 index aa9a8f0..0000000 --- a/internal/alerting/service/remediation/heal_action_service.go +++ /dev/null @@ -1,172 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "fmt" - "os" - "time" - - "github.com/rs/zerolog/log" -) - -// HealActionServiceImpl implements HealActionService -type HealActionServiceImpl struct { - dao HealActionDAO -} - -// NewHealActionService creates a new heal action service -func NewHealActionService(dao HealActionDAO) *HealActionServiceImpl { - return &HealActionServiceImpl{dao: dao} -} - -// IdentifyFaultDomain identifies the fault domain from alert labels -func (s *HealActionServiceImpl) IdentifyFaultDomain(labels map[string]string) FaultDomain { - service := labels["service_name"] - version := labels["version"] - - if service != "" && version != "" { - return FaultDomainServiceVersion - } - - // TODO: 可根据更多条件扩展其他故障域 - // - 整体问题:检查是否有全局性指标异常 - // - 单机房问题:检查是否有机房相关标签 - // - 网络问题:检查是否有网络相关标签 - return FaultDomainUnknown -} - -// GetHealAction retrieves the appropriate heal action for a fault domain -func (s *HealActionServiceImpl) GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) { - if faultDomain == FaultDomainUnknown { - return nil, fmt.Errorf("unknown fault domain, cannot determine heal action") - } - - action, err := s.dao.GetByType(ctx, string(faultDomain)) - if err != nil { - return nil, fmt.Errorf("failed to get heal action for domain %s: %w", faultDomain, err) - } - - return action, nil -} - -// ExecuteHealAction executes the heal action based on the rules -func (s *HealActionServiceImpl) ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) { - if action == nil { - return &HealActionResult{ - Success: false, - Error: "no heal action provided", - }, nil - } - - // Parse the rules - var rules HealActionRules - if err := 
json.Unmarshal(action.Rules, &rules); err != nil { - return &HealActionResult{ - Success: false, - Error: fmt.Sprintf("failed to parse heal action rules: %v", err), - }, nil - } - - // Execute based on action type - switch rules.Action { - case "rollback": - return s.executeRollback(ctx, rules, alertID, labels) - case "alert": - return s.executeAlert(rules, alertID, labels) - default: - return &HealActionResult{ - Success: false, - Error: fmt.Sprintf("unsupported action type: %s", rules.Action), - }, nil - } -} - -// executeRollback executes a rollback operation -func (s *HealActionServiceImpl) executeRollback(ctx context.Context, rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { - _ = ctx // TODO: Use context for HTTP timeout when calling real rollback API - // Check deployment status if specified - if rules.DeploymentStatus != "" { - // TODO: 实际实现中应该查询部署系统获取真实的部署状态 - // 这里暂时模拟检查 - deployStatus := s.getDeploymentStatus(labels) - if deployStatus != rules.DeploymentStatus { - return &HealActionResult{ - Success: false, - Message: fmt.Sprintf("deployment status mismatch: expected %s, got %s", rules.DeploymentStatus, deployStatus), - }, nil - } - } - - // Mock rollback execution - sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - log.Info(). - Str("alert_id", alertID). - Str("target", rules.Target). - Dur("sleep_duration", sleepDur). - Msg("executing mock rollback") - - // Simulate rollback time - time.Sleep(sleepDur) - - // TODO: 实际实现中应该调用真实的回滚接口 - // url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(labels)) - // 发起 HTTP POST 请求到回滚接口 - - return &HealActionResult{ - Success: true, - Message: fmt.Sprintf("rollback completed successfully, target: %s", rules.Target), - }, nil -} - -// executeAlert executes an alert-only action (no automatic healing) -func (s *HealActionServiceImpl) executeAlert(rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { - _ = labels // TODO: Use labels for context-specific alert messages - log.Warn(). - Str("alert_id", alertID). - Str("message", rules.Message). - Msg("heal action requires manual intervention") - - return &HealActionResult{ - Success: false, - Message: rules.Message, - }, nil -} - -// getDeploymentStatus gets the deployment status for the given labels -// TODO: 实际实现中应该查询部署系统获取真实的部署状态 -func (s *HealActionServiceImpl) getDeploymentStatus(labels map[string]string) string { - // 这里暂时返回模拟状态 - // 实际实现中应该: - // 1. 从 labels 中提取 service 和 version - // 2. 查询部署系统 API 获取当前部署状态 - // 3. 
返回 "deploying" 或 "deployed" - - service := labels["service_name"] - version := labels["version"] - - if service == "" || version == "" { - return "unknown" - } - - // 模拟逻辑:如果版本号包含 "dev" 或 "test",认为是发布中,待确认修改为实际的部署状态区分方式 - if version == "dev" || version == "test" { - return "deploying" - } - - return "deployed" -} - -// deriveDeployIDFromLabels derives deployment ID from labels -// TODO: Use this function when implementing real rollback API calls -func deriveDeployIDFromLabels(labels map[string]string) string { - if v := labels["deploy_id"]; v != "" { - return v - } - service := labels["service_name"] - version := labels["version"] - if service != "" && version != "" { - return fmt.Sprintf("%s:%s", service, version) - } - return "" -} diff --git a/internal/alerting/service/remediation/heal_action_service_test.go b/internal/alerting/service/remediation/heal_action_service_test.go deleted file mode 100644 index eb45209..0000000 --- a/internal/alerting/service/remediation/heal_action_service_test.go +++ /dev/null @@ -1,178 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "testing" -) - -func TestHealActionServiceImpl_IdentifyFaultDomain(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - labels map[string]string - expected FaultDomain - }{ - { - name: "service_version_issue", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expected: FaultDomainServiceVersion, - }, - { - name: "missing_service_name", - labels: map[string]string{ - "version": "v1.0.0", - }, - expected: FaultDomainUnknown, - }, - { - name: "missing_version", - labels: map[string]string{ - "service_name": "test-service", - }, - expected: FaultDomainUnknown, - }, - { - name: "empty_labels", - labels: map[string]string{}, - expected: FaultDomainUnknown, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := service.IdentifyFaultDomain(tt.labels) - if result != tt.expected { - t.Errorf("IdentifyFaultDomain() = %v, want %v", result, tt.expected) - } - }) - } -} - -func TestHealActionServiceImpl_ExecuteHealAction(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - action *HealAction - alertID string - labels map[string]string - expectError bool - }{ - { - name: "rollback_action", - action: &HealAction{ - ID: "test-rollback", - Desc: "Test rollback action", - Type: "service_version_issue", - Rules: json.RawMessage(`{ - "deployment_status": "deploying", - "action": "rollback", - "target": "previous_version" - }`), - }, - alertID: "test-alert-1", - labels: map[string]string{ - "service_name": "test-service", - "version": "dev", - }, - expectError: false, - }, - { - name: "alert_action", - action: &HealAction{ - ID: "test-alert", - Desc: "Test alert action", - Type: "service_version_issue", - Rules: json.RawMessage(`{ - "action": "alert", - "message": "Version already deployed, manual intervention required" - }`), - }, - alertID: "test-alert-2", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expectError: false, - }, - { - name: "nil_action", - action: nil, - alertID: "test-alert-3", - labels: map[string]string{}, - expectError: false, // Should not error, but return failure result - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := service.ExecuteHealAction(context.Background(), tt.action, tt.alertID, tt.labels) - - if tt.expectError && err == nil { - 
t.Errorf("ExecuteHealAction() expected error but got none") - } - if !tt.expectError && err != nil { - t.Errorf("ExecuteHealAction() unexpected error: %v", err) - } - - if result == nil { - t.Errorf("ExecuteHealAction() returned nil result") - } - }) - } -} - -func TestHealActionServiceImpl_getDeploymentStatus(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - labels map[string]string - expected string - }{ - { - name: "deploying_version", - labels: map[string]string{ - "service_name": "test-service", - "version": "dev", - }, - expected: "deploying", - }, - { - name: "deployed_version", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expected: "deployed", - }, - { - name: "missing_service_name", - labels: map[string]string{ - "version": "v1.0.0", - }, - expected: "unknown", - }, - { - name: "missing_version", - labels: map[string]string{ - "service_name": "test-service", - }, - expected: "unknown", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := service.getDeploymentStatus(tt.labels) - if result != tt.expected { - t.Errorf("getDeploymentStatus() = %v, want %v", result, tt.expected) - } - }) - } -} diff --git a/internal/alerting/service/remediation/init_heal_actions.sql b/internal/alerting/service/remediation/init_heal_actions.sql deleted file mode 100644 index b11fcee..0000000 --- a/internal/alerting/service/remediation/init_heal_actions.sql +++ /dev/null @@ -1,38 +0,0 @@ --- 创建 heal_actions 表 -CREATE TABLE IF NOT EXISTS heal_actions ( - id VARCHAR(255) PRIMARY KEY, - desc TEXT NOT NULL, - type VARCHAR(255) NOT NULL, - rules JSONB NOT NULL -); - --- 创建索引 -CREATE INDEX IF NOT EXISTS idx_heal_actions_type ON heal_actions(type); - --- 插入示例数据 -INSERT INTO heal_actions (id, desc, type, rules) VALUES -( - 'service_version_rollback_deploying', - '服务版本回滚方案(发布中版本)', - 'service_version_issue', - '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}' -), -( - 'service_version_alert_deployed', - '服务版本告警方案(已完成发布版本)', - 'service_version_issue', - '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚,需要人工介入处理"}' -), -( - 'service_version_rollback_default', - '服务版本回滚方案(默认)', - 'service_version_issue', - '{"action": "rollback", "target": "previous_version"}' -) -ON CONFLICT (id) DO UPDATE SET - desc = EXCLUDED.desc, - type = EXCLUDED.type, - rules = EXCLUDED.rules; - --- 查询验证 -SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id; diff --git a/internal/alerting/service/remediation/observation_window.go b/internal/alerting/service/remediation/observation_window.go deleted file mode 100644 index c07e146..0000000 --- a/internal/alerting/service/remediation/observation_window.go +++ /dev/null @@ -1,169 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "fmt" - "time" - - "github.com/redis/go-redis/v9" - "github.com/rs/zerolog/log" -) - -// RedisObservationWindowManager implements ObservationWindowManager using Redis -type RedisObservationWindowManager struct { - redis *redis.Client -} - -// NewRedisObservationWindowManager creates a new Redis-based observation window manager -func NewRedisObservationWindowManager(redis *redis.Client) *RedisObservationWindowManager { - return &RedisObservationWindowManager{redis: redis} -} - -// StartObservation starts an observation window for a service -func (m *RedisObservationWindowManager) StartObservation(ctx context.Context, service, version, alertID 
string, duration time.Duration) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - now := time.Now() - window := &ObservationWindow{ - Duration: duration, - Service: service, - Version: version, - AlertID: alertID, - StartTime: now, - EndTime: now.Add(duration), - IsActive: true, - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - data, err := json.Marshal(window) - if err != nil { - return fmt.Errorf("failed to marshal observation window: %w", err) - } - - // Store with TTL equal to observation duration + buffer - ttl := duration + 5*time.Minute - err = m.redis.Set(ctx, key, data, ttl).Err() - if err != nil { - return fmt.Errorf("failed to store observation window: %w", err) - } - - log.Info(). - Str("service", service). - Str("version", version). - Str("alert_id", alertID). - Dur("duration", duration). - Time("end_time", window.EndTime). - Msg("started observation window") - - return nil -} - -// CheckObservation checks if there's an active observation window for a service -func (m *RedisObservationWindowManager) CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) { - if m.redis == nil { - return nil, fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - data, err := m.redis.Get(ctx, key).Result() - if err != nil { - if err == redis.Nil { - return nil, nil // No active observation window - } - return nil, fmt.Errorf("failed to get observation window: %w", err) - } - - var window ObservationWindow - err = json.Unmarshal([]byte(data), &window) - if err != nil { - return nil, fmt.Errorf("failed to unmarshal observation window: %w", err) - } - - // Check if observation window has expired - if time.Now().After(window.EndTime) { - // Clean up expired window - m.redis.Del(ctx, key) - return nil, nil - } - - return &window, nil -} - -// CompleteObservation completes an observation window and marks it as successful -func (m *RedisObservationWindowManager) CompleteObservation(ctx context.Context, service, version string) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - - // Get the current window - window, err := m.CheckObservation(ctx, service, version) - if err != nil { - return fmt.Errorf("failed to check observation window: %w", err) - } - - if window == nil { - return fmt.Errorf("no active observation window found for service %s version %s", service, version) - } - - // Mark as completed and remove from Redis - window.IsActive = false - err = m.redis.Del(ctx, key).Err() - if err != nil { - return fmt.Errorf("failed to remove observation window: %w", err) - } - - log.Info(). - Str("service", service). - Str("version", version). - Str("alert_id", window.AlertID). - Dur("duration", window.Duration). 
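		// The Redis key was deleted above, so a repeat CompleteObservation for
		// the same service/version fails with the "no active window" error.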
- Msg("completed observation window successfully") - - return nil -} - -// CancelObservation cancels an observation window due to new alerts -func (m *RedisObservationWindowManager) CancelObservation(ctx context.Context, service, version string) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - - // Get the current window for logging - window, err := m.CheckObservation(ctx, service, version) - if err != nil { - return fmt.Errorf("failed to check observation window: %w", err) - } - - if window == nil { - return nil // No active window to cancel - } - - // Remove the observation window - err = m.redis.Del(ctx, key).Err() - if err != nil { - return fmt.Errorf("failed to cancel observation window: %w", err) - } - - log.Warn(). - Str("service", service). - Str("version", version). - Str("alert_id", window.AlertID). - Msg("cancelled observation window due to new alerts") - - return nil -} - -// GetObservationDuration returns the configured observation duration -// TODO: 后续可以从配置或数据库中动态获取观察时间 -func GetObservationDuration() time.Duration { - // 暂时使用固定的30分钟观察窗口 - // 后续可以扩展为从环境变量或配置文件中读取 - return 30 * time.Minute -} diff --git a/internal/alerting/service/remediation/observation_window_test.go b/internal/alerting/service/remediation/observation_window_test.go deleted file mode 100644 index 8a2ec35..0000000 --- a/internal/alerting/service/remediation/observation_window_test.go +++ /dev/null @@ -1,100 +0,0 @@ -package remediation - -import ( - "context" - "testing" - "time" - - "github.com/redis/go-redis/v9" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestRedisObservationWindowManager(t *testing.T) { - // 使用内存 Redis 客户端进行测试 - rdb := redis.NewClient(&redis.Options{ - Addr: "localhost:6379", // 需要 Redis 实例 - }) - defer rdb.Close() - - // 检查 Redis 连接 - ctx := context.Background() - if err := rdb.Ping(ctx).Err(); err != nil { - t.Skip("Redis not available, skipping test") - } - - manager := NewRedisObservationWindowManager(rdb) - - t.Run("StartObservation", func(t *testing.T) { - service := "test-service" - version := "v1.0.0" - alertID := "test-alert-1" - duration := 5 * time.Minute - - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 验证观察窗口已创建 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - require.NotNil(t, window) - assert.Equal(t, service, window.Service) - assert.Equal(t, version, window.Version) - assert.Equal(t, alertID, window.AlertID) - assert.True(t, window.IsActive) - }) - - t.Run("CheckObservation_NotFound", func(t *testing.T) { - service := "non-existent-service" - version := "v1.0.0" - - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) - - t.Run("CompleteObservation", func(t *testing.T) { - service := "test-service-2" - version := "v1.0.0" - alertID := "test-alert-2" - duration := 5 * time.Minute - - // 先创建观察窗口 - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 完成观察窗口 - err = manager.CompleteObservation(ctx, service, version) - require.NoError(t, err) - - // 验证观察窗口已被移除 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) - - t.Run("CancelObservation", func(t *testing.T) { - service := "test-service-3" - version := "v1.0.0" - alertID := "test-alert-3" - duration := 5 
* time.Minute - - // 先创建观察窗口 - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 取消观察窗口 - err = manager.CancelObservation(ctx, service, version) - require.NoError(t, err) - - // 验证观察窗口已被移除 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) -} - -func TestGetObservationDuration(t *testing.T) { - duration := GetObservationDuration() - assert.Equal(t, 30*time.Minute, duration) -} diff --git a/internal/alerting/service/remediation/types.go b/internal/alerting/service/remediation/types.go deleted file mode 100644 index c1c5f02..0000000 --- a/internal/alerting/service/remediation/types.go +++ /dev/null @@ -1,74 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "time" -) - -// HealAction represents a healing action configuration -type HealAction struct { - ID string `json:"id"` - Desc string `json:"desc"` - Type string `json:"type"` - Rules json.RawMessage `json:"rules"` -} - -// HealActionRules represents the rules for a heal action -type HealActionRules struct { - DeploymentStatus string `json:"deployment_status,omitempty"` - Action string `json:"action"` - Target string `json:"target,omitempty"` - Message string `json:"message,omitempty"` -} - -// FaultDomain represents the identified fault domain -type FaultDomain string - -const ( - FaultDomainServiceVersion FaultDomain = "service_version_issue" - FaultDomainUnknown FaultDomain = "unknown" -) - -// HealActionResult represents the result of executing a heal action -type HealActionResult struct { - Success bool `json:"success"` - Message string `json:"message,omitempty"` - Error string `json:"error,omitempty"` -} - -// ObservationWindow represents the observation period after healing -type ObservationWindow struct { - Duration time.Duration `json:"duration"` - Service string `json:"service"` - Version string `json:"version"` - AlertID string `json:"alert_id"` - StartTime time.Time `json:"start_time"` - EndTime time.Time `json:"end_time"` - IsActive bool `json:"is_active"` -} - -// ObservationWindowManager defines the interface for managing observation windows -type ObservationWindowManager interface { - StartObservation(ctx context.Context, service, version, alertID string, duration time.Duration) error - CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) - CompleteObservation(ctx context.Context, service, version string) error - CancelObservation(ctx context.Context, service, version string) error -} - -// HealActionDAO defines the interface for heal action database operations -type HealActionDAO interface { - GetByType(ctx context.Context, faultType string) (*HealAction, error) - GetByID(ctx context.Context, id string) (*HealAction, error) - Create(ctx context.Context, action *HealAction) error - Update(ctx context.Context, action *HealAction) error - Delete(ctx context.Context, id string) error - List(ctx context.Context) ([]*HealAction, error) -} - -// HealActionService defines the interface for heal action business logic -type HealActionService interface { - IdentifyFaultDomain(labels map[string]string) FaultDomain - GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) - ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) -}
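
补充示例:新版 `Consumer` 保留了 `sleepFn` 钩子,正是为了让 Mock 回滚可以在不真实等待 30s 的情况下做单元测试。下面是一个最小化的测试草图,仅作示意(假设:测试与 consumer.go 同属 `remediation` 包;healthcheck 包位于 `internal/alerting/service/healthcheck`;各存储辅助函数都像 `markRestoredInDB` 一样对 nil DB/Redis 短路;`healthcheck.AlertMessage` 可仅以 `ID`/`Service`/`Version` 构造):

```go
package remediation

import (
	"context"
	"testing"
	"time"

	"github.com/qiniu/zeroops/internal/alerting/service/healthcheck"
)

// TestStartMockRollbackSleep drives one message through Start with an
// instant sleepFn. DB/Redis are left nil on the assumption that every
// storage helper short-circuits the way markRestoredInDB visibly does.
func TestStartMockRollbackSleep(t *testing.T) {
	t.Setenv("REMEDIATION_ROLLBACK_SLEEP", "") // exercise the 30s default
	t.Setenv("REMEDIATION_ROLLBACK_URL", "")   // URL is composed but unused

	var slept time.Duration
	c := &Consumer{sleepFn: func(d time.Duration) { slept = d }}

	ch := make(chan healthcheck.AlertMessage, 1)
	ch <- healthcheck.AlertMessage{ID: "issue-1", Service: "svc", Version: "v1"}

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond) // let the single message drain
		cancel()                           // then stop the consumer loop
	}()
	c.Start(ctx, ch)

	if slept != 30*time.Second {
		t.Fatalf("expected default 30s mock-rollback sleep, got %v", slept)
	}
}
```

用 `sleepFn` 注入替代真实 sleep,可让单测在毫秒级完成;`t.Setenv` 清空环境变量,从而覆盖 `parseDuration` 的 30s 默认值路径。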