From af452488657186a748a42c22dd066c7d23ba9f46 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Fri, 19 Sep 2025 22:06:03 +0800 Subject: [PATCH 1/4] =?UTF-8?q?feat(Metrics&Alert):=20=E6=96=B0=E5=A2=9E?= =?UTF-8?q?=E8=A7=84=E5=88=99=E4=B8=8E=E9=98=88=E5=80=BC=E7=AE=A1=E7=90=86?= =?UTF-8?q?=E7=9A=84=E8=A1=A8=E7=BB=93=E6=9E=84=EF=BC=8C=E6=A0=B8=E5=BF=83?= =?UTF-8?q?=E6=8E=A5=E5=8F=A3=EF=BC=8C=E7=BC=96=E6=8E=92=E6=B5=81=E7=A8=8B?= =?UTF-8?q?=EF=BC=8C=E4=B8=8E=E5=88=9D=E6=AD=A5=E5=AE=9E=E7=8E=B0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/alerting/api.md | 2 +- docs/alerting/database-design.md | 117 ++++----- internal/alerting/service/ruleset/README.md | 235 ++++++++++++++++++ internal/alerting/service/ruleset/manager.go | 143 +++++++++++ .../alerting/service/ruleset/manager_test.go | 85 +++++++ .../alerting/service/ruleset/normalize.go | 57 +++++ .../service/ruleset/normalize_test.go | 23 ++ .../service/ruleset/promsync_exporter.go | 67 +++++ internal/alerting/service/ruleset/store_pg.go | 170 +++++++++++++ internal/alerting/service/ruleset/types.go | 86 +++++++ 10 files changed, 920 insertions(+), 65 deletions(-) create mode 100644 internal/alerting/service/ruleset/README.md create mode 100644 internal/alerting/service/ruleset/manager.go create mode 100644 internal/alerting/service/ruleset/manager_test.go create mode 100644 internal/alerting/service/ruleset/normalize.go create mode 100644 internal/alerting/service/ruleset/normalize_test.go create mode 100644 internal/alerting/service/ruleset/promsync_exporter.go create mode 100644 internal/alerting/service/ruleset/store_pg.go create mode 100644 internal/alerting/service/ruleset/types.go diff --git a/docs/alerting/api.md b/docs/alerting/api.md index df0ddce..80d4735 100644 --- a/docs/alerting/api.md +++ b/docs/alerting/api.md @@ -126,7 +126,7 @@ GET /v1/issues/{issueID} |--------|------|------| | id | string | 告警问题唯一标识 | | state | string | 告警工单的生命周期状态状态:`Open`、`Closed` | -| level | string | 告警等级:`P0`、`P1`、`P2`、`Warning` | +| level | string | 告警等级:`P0`、`P1`、`P2` | | alertState | string | 告警本身的实时状态:`Pending`、`Restored`、`AutoRestored`、`InProcessing` | | title | string | 告警标题描述 | | labels | Label[] | 标签数组 | diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md index 47508d1..98b3e1c 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -2,14 +2,13 @@ ## 概述 -本文档为最新数据库设计,总计包含 7 张表: +本文档为最新数据库设计,总计包含 6 张表: - alert_issues - alert_issue_comments -- metric_alert_changes +- alert_meta_change_logs - alert_rules -- service_alert_metas -- service_metrics +- alert_rule_metas - service_states ## 数据表设计 @@ -51,16 +50,22 @@ --- -### 3) metric_alert_changes(指标告警规则变更记录表) +### 3) alert_meta_change_logs(阈值变更记录表) -用于追踪指标类告警规则或参数的变更历史。 +用于追踪规则阈值(threshold)与观察窗口(watch_time)的变更历史。 | 字段名 | 类型 | 说明 | |--------|------|------| -| id | varchar(64) PK | 变更记录 ID | -| change_time | TIMESTAMP(6) | 变更时间 | -| alert_name | varchar(255) | 告警名称/规则名 | -| change_items | json | 变更项数组:[{key, old_value, new_value}] | +| id | varchar(64) PK | 幂等/去重标识 | +| change_type | varchar(16) | 变更类型:Create / Update / Delete / Rollback | +| change_time | timestamptz | 变更时间 | +| alert_name | varchar(255) | 规则名 | +| labels | text | labels 的 JSON 字符串表示(规范化后) | +| old_threshold | numeric | 旧阈值(可空) | +| new_threshold | numeric | 新阈值(可空) | +| old_watch | interval | 旧观察窗口(可空) | +| new_watch | interval | 新观察窗口(可空) | + **索引建议:** - PRIMARY KEY: `id` @@ -71,49 +76,37 @@ ### 4) alert_rules(告警规则表) 
-定义可复用的规则表达式,支持作用域绑定。 - -| 字段名 | 类型 | 说明 | +| 字段名 | 类型 | 说明 | |--------|------|------| -| id | varchar(255) PK | 规则 ID(可与 K8s 资源 ID 对应或做映射) | -| name | varchar(255) | 规则名称,表达式可读的名称 | -| scopes | varchar(255) | 作用域,例:"services:svc1,svc2" | -| expr | text | 规则表达式(可含占位符) | - -**索引建议:** -- PRIMARY KEY: `id` -- INDEX: `(name)` -- INDEX: `(scopes)` +|name|varchar(255)|主键,告警规则名称| +|description|text|可读标题,可拼接渲染为可读的 title| +|expr|text|左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version))| +|op|varchar(4)|阈值比较方式(枚举:>, <, =, !=)| +|severity|varchar(32)|告警等级,通常进入告警的 labels.severity| ---- - -### 5) service_alert_metas(服务告警元数据表) +**约束建议:** +- CHECK 约束:`op IN ('>', '<', '=', '!=')` -按服务维度存放参数化配置,用于渲染具体规则。 +⸻ -| 字段名 | 类型 | 说明 | -|--------|------|------| -| service | varchar(255) | 服务名 | -| key | varchar(255) | 参数名(如 `apitime_threshold`) | -| value | varchar(255) | 参数值(如 `50`) | +### 5) alert_rule_metas(规则阈值元信息表) -**索引建议:** -- PRIMARY KEY: `(service, key)` -- INDEX: `(service)` +字段名 类型 说明 +alert_name varchar(255) 关联 `alert_rules.name` +labels jsonb 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 +threshold numeric 阈值(会被渲染成特定规则的 threshold metric 数值) +watch_time interval 持续时长(映射 Prometheus rule 的 for:) ---- +**约束与索引建议:** +- FOREIGN KEY: `(alert_name)` REFERENCES `alert_rules(name)` ON DELETE CASCADE +- UNIQUE: `(alert_name, labels)` +- GIN INDEX: `labels`(`CREATE INDEX idx_metas_labels_gin ON alert_rule_metas USING gin(labels);`) -### 6) service_metrics(服务指标清单表) +⸻ -记录服务所关注的指标清单(可用于 UI 侧展示或校验)。 - -| 字段名 | 类型 | 说明 | -|--------|------|------| -| service | varchar(255) PK | 服务名 | -| metrics | json | 指标名数组:["metric1", "metric2", ...] | - -**索引建议:** -- PRIMARY KEY: `service` +说明: +- labels 建议用 jsonb,方便在 Postgres 中做索引和查询。 +- labels 的键名与值格式应在应用层规范化(排序/小写/去空值)以确保唯一性和可查询性一致。 --- @@ -140,21 +133,18 @@ erDiagram alert_issues ||--o{ alert_issue_comments : "has comments" alert_rules { - varchar id PK - varchar name - varchar scopes + varchar name PK + text description text expr + varchar op + varchar severity } - service_alert_metas { - varchar service PK - varchar key PK - varchar value - } - - service_metrics { - varchar service PK - json metrics + alert_rule_metas { + varchar alert_name FK + jsonb labels + numeric threshold + interval watch_time } service_states { @@ -184,14 +174,13 @@ erDiagram text content } - %% 通过 service 逻辑关联 - service_alert_metas ||..|| service_metrics : "by service" - service_states ||..|| service_alert_metas : "by service" + %% 通过 service 等标签在应用层逻辑关联 + alert_rule_metas ||..|| alert_rules : "by alert_name" + service_states ||..|| alert_rule_metas : "by service/version labels" ``` ## 数据流转 -1. 以 `alert_rules` 为模版,结合 `service_alert_metas` 渲染出面向具体服务的规则。 -2. 指标或规则参数发生调整时,记录到 `metric_alert_changes`。 -3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 -4. 面向服务的整体健康态以 `service_states` 记录和推进(new → analyzing → processing → resolved)。 \ No newline at end of file +1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。 +2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。 +3. 
规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file diff --git a/internal/alerting/service/ruleset/README.md b/internal/alerting/service/ruleset/README.md new file mode 100644 index 0000000..9c9ff47 --- /dev/null +++ b/internal/alerting/service/ruleset/README.md @@ -0,0 +1,235 @@ +## Ruleset(规则与阈值管理) + +本目录为“规则与阈值管理(ruleset)”实现说明。内容聚焦于:表结构、核心接口、编排流程、Prometheus 同步方式、并发与一致性、测试与使用示例。文档与当前代码实现保持一致。 + +--- + +## 1) 目标与边界(已实现) + +- 通过 `alert_rules` 与 `alert_rule_metas`,为同一告警规则按标签维度(如 `service`、`version`)配置阈值与持续时间(`watch_time`)。 +- 变更阈值后,立刻同步到内存 Exporter(无需 Prometheus reload)。 +- 多告警等级(P0/P1…)通过“多条规则”实现(如 `latency_p95_P0` 与 `latency_p95_P1`)。 +- 记录变更日志,支持审计,便于后续扩展回滚能力。 + +--- + +## 2) Go 组件与接口 + +### 2.1 关键类型与接口(节选) + +```go +// types.go +type AlertRule struct { + Name string + Description string + Expr string + Op string + Severity string +} + +type LabelMap map[string]string + +type AlertRuleMeta struct { + AlertName string + Labels LabelMap + Threshold float64 + WatchTime time.Duration // interval 映射 +} + +type ChangeLog struct { + ID string + AlertName string + ChangeType string + Labels LabelMap + OldThreshold *float64 + NewThreshold *float64 + OldWatch *time.Duration + NewWatch *time.Duration + ChangeTime time.Time +} + +type Store interface { + // rules + CreateRule(ctx context.Context, r *AlertRule) error + GetRule(ctx context.Context, name string) (*AlertRule, error) + UpdateRule(ctx context.Context, r *AlertRule) error + DeleteRule(ctx context.Context, name string) error + + // metas (UPSERT by alert_name + labels) + UpsertMeta(ctx context.Context, m *AlertRuleMeta) (created bool, err error) + GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) + DeleteMeta(ctx context.Context, name string, labels LabelMap) error + + // change logs + InsertChangeLog(ctx context.Context, log *ChangeLog) error + + // tx helpers + WithTx(ctx context.Context, fn func(Store) error) error +} + +type PromSync interface { + AddToPrometheus(ctx context.Context, r *AlertRule) error // 新增时更新 rule 文件并 reload(当前实现为占位) + DeleteFromPrometheus(ctx context.Context, name string) error // 删除(当前实现为占位) + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error +} + +type AlertRuleMgr interface { + LoadRule(ctx context.Context) error + UpsertRuleMetas(ctx context.Context, m *AlertRuleMeta) error + AddAlertRule(ctx context.Context, r *AlertRule) error + DeleteAlertRule(ctx context.Context, name string) error + + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error + RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error +} +``` + +### 2.2 Manager 核心流程(与实现一致) + +```go +func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) error { + meta.Labels = NormalizeLabels(meta.Labels, m.aliasMap) + if err := validateMeta(meta); err != nil { return err } + return m.store.WithTx(ctx, func(tx Store) error { + oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { return err } + var old *AlertRuleMeta + if len(oldList) > 0 { old = oldList[0] } + if _, err := tx.UpsertMeta(ctx, meta); err != nil { return err } + if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { return err } + return m.prom.SyncMetaToPrometheus(ctx, meta) + }) +} +``` + +--- + +## 3) Prometheus 同步 + +- 实现为内存版 Exporter(`ExporterSync`),维护 `(rule + 规范化 labels) → {threshold, watch_time}`。 +- 
`SyncMetaToPrometheus` 直接更新内存映射,变更即时生效。 +- `AddToPrometheus`/`DeleteFromPrometheus` 作为占位,当前不写规则文件。 +- 如需以 metrics 暴露阈值,可在同进程 `/metrics` 将 `ExporterSync` 中的映射导出(按规则维度命名指标)。 + +--- + +## 4) 事务、并发与一致性 + +- `Store.WithTx`:当前 PgStore 直接调用 fn(占位),可按需扩展为真正事务。 +- 写入采用单条 UPSERT(见下文 SQL),满足幂等。 +- 如存在同一 `(alert_name, labels)` 的高并发写入,建议使用 Postgres advisory lock。 +- Exporter 同步在 Upsert 成功后执行,生产中建议串行化该步骤以避免竞态。 + +提示: +- 标签命名不一致(例如 `service_version`/`version` 混用)通过 `NormalizeLabels` 的别名映射解决。 +- 多层阈值优先级(`{}`, `{service}`, `{service,version}`)建议仅导出“最具体”的一条(当前实现未裁剪,可扩展)。 + +--- + +## 5) SQL 示例(与代码一致) + +### 5.1 UPSERT Meta(带审计在应用层做) + +```sql +-- 假设参数:$1 alert_name, $2 labels::jsonb, $3 threshold::numeric, $4 watch::interval +INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) +VALUES ($1, $2, $3, $4) +ON CONFLICT (alert_name, labels) DO UPDATE SET + threshold = EXCLUDED.threshold, + watch_time = EXCLUDED.watch_time, + updated_at = now(); +``` + +### 5.2 查询:按部分标签匹配 + +```sql +-- 传入 {"service":"stg"},返回该规则下 service=stg 的 metas(无视 version) +SELECT * FROM alert_rule_metas +WHERE alert_name = $1 + AND labels @> $2::jsonb; -- 包含关系 +``` + +--- + +## 6) 使用示例(最小化) + +```go +db, _ := database.New(os.Getenv("ALERTING_PG_DSN")) +store := ruleset.NewPgStore(db) +prom := ruleset.NewExporterSync() +mgr := ruleset.NewManager(store, prom, map[string]string{"service_version":"version"}) + +meta := &ruleset.AlertRuleMeta{ + AlertName: "latency_p95_P0", + Labels: ruleset.LabelMap{"Service": "s3", "service_version": "v1"}, + Threshold: 450, + WatchTime: 2 * time.Minute, +} +_ = mgr.UpsertRuleMetas(context.Background(), meta) +``` + +--- + +## 7) Exporter 要点(当前实现) + +- 使用 `CanonicalLabelKey` 生成稳定键。 +- 当前未实现“优先级裁剪”(`{}`, `{service}`, `{service,version}` 仅导出最具体),可按需扩展。 +- 多副本部署需共享或拉取状态(可由 DB 拉取或事件广播)。 + +--- + +## 8) 测试 + +### 8.1 单元测试 + +- NormalizeLabels 与 CanonicalLabelKey: + - 输入包含大小写、空白、别名键(如 `service_version`)的 labels,断言小写化、去空白、别名映射、移除空值; + - 对乱序键,`CanonicalLabelKey` 结果一致。 +- Manager.UpsertRuleMetas: + - 使用内存实现的 Store 与 ExporterSync: + - 首次 Upsert 走 Create 分支,写入 metas,并同步到 ExporterSync; + - 再次 Upsert 走 Update 分支,产生变更日志; + - 断言阈值已生效到 ExporterSync。 + +对应测试用例:`internal/alerting/service/ruleset/normalize_test.go`,`internal/alerting/service/ruleset/manager_test.go` + +运行: + +```bash +go test ./internal/alerting/service/ruleset -v +``` + +### 8.2 手动测试(本地) + +- 数据库准备: + - 按本文数据库设计创建表(或参考 `docs/alerting/database-design.md`)。 + - 在 `alert_rules` 插入一条规则,如:`latency_p95_P0`。 +- 启动 Exporter/服务: + - 代码中使用 `NewExporterSync()` 并注入到 `NewManager(...)`。 + - 通过 `Manager.UpsertRuleMetas` 传入 `AlertRuleMeta{AlertName:"latency_p95_P0", Labels:{service:"s3",version:"v1"}, Threshold:450, WatchTime:2m}`。 + - 验证内存 Exporter `ForTestingGet` 返回阈值为 450。 +- 变更验证: + - 再次调用 `UpsertRuleMetas`,阈值改为 500,检查 `alert_meta_change_logs` 新增 Update 记录。 +- 回滚演练: + - 读取上一条变更日志的 old 值,再次 Upsert 即可实现回滚(可后续补充接口)。 + +--- + +## 9) 需求映射 + +- 同一规则多阈值等级(P0/P1)→ 通过多条 `alert_rules`(如 `_P0` 与 `_P1`)。 +- 告警变更接口(service + meta 参数)→ 统一落在 `alert_rule_metas`(已支持 labels 任意组合)。 +- 变更记录查询 → `alert_meta_change_logs`。 + +--- + +## 10) 后续增强(建议) + +1. Exporter 端优先级裁剪(仅导出最具体标签的阈值)。 +2. PgStore 接入真实事务(BeginTx),必要时使用 advisory lock。 +3. 增加回滚接口:基于 change_log 的 old 值再 Upsert 一次。 +4. 
阈值 metrics 暴露:统一命名(每条规则单独 threshold metric)。 + +--- \ No newline at end of file diff --git a/internal/alerting/service/ruleset/manager.go b/internal/alerting/service/ruleset/manager.go new file mode 100644 index 0000000..03e58d2 --- /dev/null +++ b/internal/alerting/service/ruleset/manager.go @@ -0,0 +1,143 @@ +package ruleset + +import ( + "context" + "errors" + "fmt" + "time" +) + +var ( + // ErrInvalidMeta indicates provided meta is incomplete or invalid. + ErrInvalidMeta = errors.New("invalid alert rule meta") +) + +// Manager implements AlertRuleMgr, coordinating store and Prometheus sync. +type Manager struct { + store Store + prom PromSync + aliasMap map[string]string +} + +func NewManager(store Store, prom PromSync, aliasMap map[string]string) *Manager { + if aliasMap == nil { + aliasMap = map[string]string{} + } + return &Manager{store: store, prom: prom, aliasMap: aliasMap} +} + +func (m *Manager) LoadRule(ctx context.Context) error { return nil } + +func (m *Manager) AddAlertRule(ctx context.Context, r *AlertRule) error { + if r == nil || r.Name == "" { + return fmt.Errorf("invalid rule") + } + if err := m.store.CreateRule(ctx, r); err != nil { + return err + } + return m.prom.AddToPrometheus(ctx, r) +} + +func (m *Manager) DeleteAlertRule(ctx context.Context, name string) error { + if name == "" { + return fmt.Errorf("invalid name") + } + if err := m.store.DeleteRule(ctx, name); err != nil { + return err + } + return m.prom.DeleteFromPrometheus(ctx, name) +} + +func (m *Manager) AddToPrometheus(ctx context.Context, r *AlertRule) error { + return m.prom.AddToPrometheus(ctx, r) +} +func (m *Manager) DeleteFromPrometheus(ctx context.Context, name string) error { + return m.prom.DeleteFromPrometheus(ctx, name) +} +func (m *Manager) SyncMetaToPrometheus(ctx context.Context, meta *AlertRuleMeta) error { + return m.prom.SyncMetaToPrometheus(ctx, meta) +} + +func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) error { + if meta == nil { + return ErrInvalidMeta + } + meta.Labels = NormalizeLabels(meta.Labels, m.aliasMap) + if err := validateMeta(meta); err != nil { + return err + } + return m.store.WithTx(ctx, func(tx Store) error { + oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { + return err + } + var old *AlertRuleMeta + if len(oldList) > 0 { + old = oldList[0] + } + _, err = tx.UpsertMeta(ctx, meta) + if err != nil { + return err + } + if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { + return err + } + if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { + return err + } + return nil + }) +} + +func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { + if newMeta == nil { + return nil + } + var oldTh, newTh *float64 + var oldW, newW *time.Duration + if oldMeta != nil { + oldTh = &oldMeta.Threshold + oldW = &oldMeta.WatchTime + } + if newMeta != nil { + newTh = &newMeta.Threshold + newW = &newMeta.WatchTime + } + log := &ChangeLog{ + ID: fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), time.Now().UnixNano()), + AlertName: newMeta.AlertName, + ChangeType: classifyChange(oldMeta, newMeta), + Labels: newMeta.Labels, + OldThreshold: oldTh, + NewThreshold: newTh, + OldWatch: oldW, + NewWatch: newW, + ChangeTime: time.Now().UTC(), + } + return m.store.InsertChangeLog(ctx, log) +} + +func classifyChange(oldMeta, newMeta *AlertRuleMeta) string { + if oldMeta == nil && newMeta != nil { + return "Create" + } + if oldMeta != nil && newMeta == nil 
{ + return "Delete" + } + return "Update" +} + +func validateMeta(m *AlertRuleMeta) error { + if m.AlertName == "" { + return ErrInvalidMeta + } + if !isFinite(m.Threshold) { + return ErrInvalidMeta + } + if m.WatchTime < 0 { + return ErrInvalidMeta + } + return nil +} + +func isFinite(f float64) bool { return !((f != f) || (f > 1e308) || (f < -1e308)) } diff --git a/internal/alerting/service/ruleset/manager_test.go b/internal/alerting/service/ruleset/manager_test.go new file mode 100644 index 0000000..658df7b --- /dev/null +++ b/internal/alerting/service/ruleset/manager_test.go @@ -0,0 +1,85 @@ +package ruleset + +import ( + "context" + "testing" + "time" +) + +type memStore struct { + rules map[string]*AlertRule + metas map[string]*AlertRuleMeta + logs []*ChangeLog +} + +func newMemStore() *memStore { + return &memStore{rules: map[string]*AlertRule{}, metas: map[string]*AlertRuleMeta{}, logs: []*ChangeLog{}} +} + +func (m *memStore) CreateRule(ctx context.Context, r *AlertRule) error { + m.rules[r.Name] = r + return nil +} +func (m *memStore) GetRule(ctx context.Context, name string) (*AlertRule, error) { + return m.rules[name], nil +} +func (m *memStore) UpdateRule(ctx context.Context, r *AlertRule) error { + m.rules[r.Name] = r + return nil +} +func (m *memStore) DeleteRule(ctx context.Context, name string) error { + delete(m.rules, name) + return nil +} +func (m *memStore) UpsertMeta(ctx context.Context, meta *AlertRuleMeta) (bool, error) { + m.metas[meta.AlertName+"|"+CanonicalLabelKey(meta.Labels)] = meta + return true, nil +} +func (m *memStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { + if v, ok := m.metas[name+"|"+CanonicalLabelKey(labels)]; ok { + return []*AlertRuleMeta{v}, nil + } + return nil, nil +} +func (m *memStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { + delete(m.metas, name+"|"+CanonicalLabelKey(labels)) + return nil +} +func (m *memStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { + m.logs = append(m.logs, log) + return nil +} +func (m *memStore) WithTx(ctx context.Context, fn func(Store) error) error { return fn(m) } + +func TestManager_UpsertRuleMetas(t *testing.T) { + ctx := context.Background() + store := newMemStore() + prom := NewExporterSync() + mgr := NewManager(store, prom, map[string]string{"service_version": "version"}) + + meta := &AlertRuleMeta{AlertName: "latency_p95_P0", Labels: LabelMap{"Service": "s3", "service_version": "v1"}, Threshold: 450, WatchTime: 2 * time.Minute} + if err := mgr.UpsertRuleMetas(ctx, meta); err != nil { + t.Fatalf("upsert meta: %v", err) + } + // verify normalization + if _, ok := store.metas["latency_p95_P0|service=s3|version=v1"]; !ok { + t.Fatalf("normalized meta not found in store: %#v", store.metas) + } + // verify prom sync + if th, _, ok := prom.ForTestingGet("latency_p95_P0", LabelMap{"service": "s3", "version": "v1"}); !ok || th != 450 { + t.Fatalf("prom sync threshold mismatch: th=%v ok=%v", th, ok) + } + // verify change log + if len(store.logs) != 1 || store.logs[0].ChangeType != "Create" { + t.Fatalf("unexpected change logs: %#v", store.logs) + } + + // update path + meta2 := &AlertRuleMeta{AlertName: "latency_p95_P0", Labels: LabelMap{"service": "s3", "version": "v1"}, Threshold: 500, WatchTime: 3 * time.Minute} + if err := mgr.UpsertRuleMetas(ctx, meta2); err != nil { + t.Fatalf("upsert meta2: %v", err) + } + if len(store.logs) != 2 || store.logs[1].ChangeType != "Update" { + t.Fatalf("expected update log, got: %#v", 
store.logs) + } +} diff --git a/internal/alerting/service/ruleset/normalize.go b/internal/alerting/service/ruleset/normalize.go new file mode 100644 index 0000000..74b7b8c --- /dev/null +++ b/internal/alerting/service/ruleset/normalize.go @@ -0,0 +1,57 @@ +package ruleset + +import ( + "sort" + "strings" +) + +// NormalizeLabels returns a new LabelMap with keys normalized to lowercase, trimmed, aliases applied, +// empty values removed, and values trimmed. It does not mutate the input map. +// aliasMap maps alternative keys to canonical keys, e.g., "service_version" -> "version". +func NormalizeLabels(in LabelMap, aliasMap map[string]string) LabelMap { + if len(in) == 0 { + return LabelMap{} + } + result := make(LabelMap, len(in)) + for rawKey, rawVal := range in { + key := strings.ToLower(strings.TrimSpace(rawKey)) + if key == "" { + continue + } + if canonical, ok := aliasMap[key]; ok && strings.TrimSpace(canonical) != "" { + key = strings.ToLower(strings.TrimSpace(canonical)) + } + val := strings.TrimSpace(rawVal) + if val == "" { + continue + } + result[key] = val + } + return result +} + +// CanonicalLabelKey returns a stable string representation of labels for use as a map key. +// It sorts keys and concatenates as key=value pairs separated by '|'. +// This ensures {a=1,b=2} and {b=2,a=1} produce identical keys. +func CanonicalLabelKey(labels LabelMap) string { + if len(labels) == 0 { + return "{}" + } + keys := make([]string, 0, len(labels)) + for k := range labels { + keys = append(keys, k) + } + sort.Strings(keys) + var b strings.Builder + b.Grow(len(keys) * 8) + for i := 0; i < len(keys); i++ { + if i > 0 { + b.WriteByte('|') + } + k := keys[i] + b.WriteString(k) + b.WriteByte('=') + b.WriteString(labels[k]) + } + return b.String() +} diff --git a/internal/alerting/service/ruleset/normalize_test.go b/internal/alerting/service/ruleset/normalize_test.go new file mode 100644 index 0000000..02a9c75 --- /dev/null +++ b/internal/alerting/service/ruleset/normalize_test.go @@ -0,0 +1,23 @@ +package ruleset + +import "testing" + +func TestNormalizeLabels(t *testing.T) { + alias := map[string]string{"service_version": "version"} + in := LabelMap{" Service ": " s3 ", "service_version": " V1 ", "empty": " "} + out := NormalizeLabels(in, alias) + if out["service"] != "s3" || out["version"] != "V1" { + t.Fatalf("unexpected normalize: %#v", out) + } + if _, ok := out["empty"]; ok { + t.Fatalf("empty value should be removed: %#v", out) + } +} + +func TestCanonicalLabelKey(t *testing.T) { + key1 := CanonicalLabelKey(LabelMap{"b": "2", "a": "1"}) + key2 := CanonicalLabelKey(LabelMap{"a": "1", "b": "2"}) + if key1 != key2 { + t.Fatalf("keys should be equal: %s vs %s", key1, key2) + } +} diff --git a/internal/alerting/service/ruleset/promsync_exporter.go b/internal/alerting/service/ruleset/promsync_exporter.go new file mode 100644 index 0000000..573365f --- /dev/null +++ b/internal/alerting/service/ruleset/promsync_exporter.go @@ -0,0 +1,67 @@ +package ruleset + +import ( + "context" + "fmt" + "sync" + "time" +) + +// ExporterSync is an in-memory PromSync implementation that maintains threshold/watch values +// for each (rule, labels) pair. It is intended for unit tests and simple deployments where +// another component exposes these as metrics. 
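// One way another component could expose the map as metrics — a sketch
// assuming prometheus/client_golang, which this package does not import:
//
//	gauge := prometheus.NewGaugeVec(
//		prometheus.GaugeOpts{Name: "alert_rule_threshold"},
//		[]string{"rule", "labels"},
//	)
//	for key, th := range thresholds { // a snapshot taken under the read lock
//		rule, lbls, _ := strings.Cut(key, "|") // key layout: "<rule>|<canonical labels>"
//		gauge.WithLabelValues(rule, lbls).Set(th)
//	}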
+type ExporterSync struct { + mu sync.RWMutex + thresholds map[string]float64 + watchTimes map[string]time.Duration +} + +func NewExporterSync() *ExporterSync { + return &ExporterSync{ + thresholds: make(map[string]float64), + watchTimes: make(map[string]time.Duration), + } +} + +// keyFor builds a stable key for the given rule and labels. +func (e *ExporterSync) keyFor(rule string, labels LabelMap) string { + return fmt.Sprintf("%s|%s", rule, CanonicalLabelKey(labels)) +} + +func (e *ExporterSync) AddToPrometheus(ctx context.Context, r *AlertRule) error { return nil } + +func (e *ExporterSync) DeleteFromPrometheus(ctx context.Context, name string) error { + e.mu.Lock() + defer e.mu.Unlock() + // delete all entries for the rule + prefix := name + "|" + for k := range e.thresholds { + if len(k) >= len(prefix) && k[:len(prefix)] == prefix { + delete(e.thresholds, k) + delete(e.watchTimes, k) + } + } + return nil +} + +func (e *ExporterSync) SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error { + if m == nil || m.AlertName == "" { + return fmt.Errorf("invalid meta: missing alert name") + } + e.mu.Lock() + defer e.mu.Unlock() + key := e.keyFor(m.AlertName, m.Labels) + e.thresholds[key] = m.Threshold + e.watchTimes[key] = m.WatchTime + return nil +} + +// ForTestingGet exposes current values for assertions in unit tests. +func (e *ExporterSync) ForTestingGet(rule string, labels LabelMap) (threshold float64, watch time.Duration, ok bool) { + e.mu.RLock() + defer e.mu.RUnlock() + key := e.keyFor(rule, labels) + v, ok1 := e.thresholds[key] + w, ok2 := e.watchTimes[key] + return v, w, ok1 && ok2 +} diff --git a/internal/alerting/service/ruleset/store_pg.go b/internal/alerting/service/ruleset/store_pg.go new file mode 100644 index 0000000..fe3d231 --- /dev/null +++ b/internal/alerting/service/ruleset/store_pg.go @@ -0,0 +1,170 @@ +package ruleset + +import ( + "context" + "encoding/json" + "fmt" + "time" + + abd "github.com/qiniu/zeroops/internal/alerting/database" +) + +// PgStore is a PostgreSQL-backed Store implementation using the alerting database wrapper. +// Note: The current database wrapper does not expose transactions; WithTx acts as a simple wrapper. +// For production-grade atomicity, extend the database wrapper to support sql.Tx and wire it here. +type PgStore struct { + DB *abd.Database +} + +func NewPgStore(db *abd.Database) *PgStore { return &PgStore{DB: db} } + +func (s *PgStore) WithTx(ctx context.Context, fn func(Store) error) error { + // Fallback: invoke fn directly. Replace with real transactional context when available. 
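// A possible transactional shape once the wrapper grows a BeginTx method
// (assumed API; newTxStore would be a hypothetical Store bound to the tx):
//
//	tx, err := s.DB.BeginTx(ctx, nil)
//	if err != nil {
//		return err
//	}
//	defer func() { _ = tx.Rollback() }() // returns sql.ErrTxDone after Commit; safe to ignore
//	if err := fn(newTxStore(tx)); err != nil {
//		return err
//	}
//	return tx.Commit()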
+ return fn(s) +} + +func (s *PgStore) CreateRule(ctx context.Context, r *AlertRule) error { + const q = ` + INSERT INTO alert_rules(name, description, expr, op, severity) + VALUES ($1, $2, $3, $4, $5) + ON CONFLICT (name) DO UPDATE SET + description = EXCLUDED.description, + expr = EXCLUDED.expr, + op = EXCLUDED.op, + severity = EXCLUDED.severity + ` + _, err := s.DB.ExecContext(ctx, q, r.Name, r.Description, r.Expr, r.Op, r.Severity) + if err != nil { + return fmt.Errorf("create rule: %w", err) + } + return nil +} + +func (s *PgStore) GetRule(ctx context.Context, name string) (*AlertRule, error) { + const q = `SELECT name, description, expr, op, severity FROM alert_rules WHERE name = $1` + rows, err := s.DB.QueryContext(ctx, q, name) + if err != nil { + return nil, fmt.Errorf("get rule: %w", err) + } + defer rows.Close() + if rows.Next() { + var r AlertRule + if err := rows.Scan(&r.Name, &r.Description, &r.Expr, &r.Op, &r.Severity); err != nil { + return nil, fmt.Errorf("scan rule: %w", err) + } + return &r, nil + } + return nil, fmt.Errorf("rule not found: %s", name) +} + +func (s *PgStore) UpdateRule(ctx context.Context, r *AlertRule) error { + const q = `UPDATE alert_rules SET description=$2, expr=$3, op=$4, severity=$5 WHERE name=$1` + _, err := s.DB.ExecContext(ctx, q, r.Name, r.Description, r.Expr, r.Op, r.Severity) + if err != nil { + return fmt.Errorf("update rule: %w", err) + } + return nil +} + +func (s *PgStore) DeleteRule(ctx context.Context, name string) error { + const q = `DELETE FROM alert_rules WHERE name=$1` + _, err := s.DB.ExecContext(ctx, q, name) + if err != nil { + return fmt.Errorf("delete rule: %w", err) + } + return nil +} + +func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error) { + labelsJSON, _ := json.Marshal(m.Labels) + const q = ` + INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) + VALUES ($1, $2::jsonb, $3, $4) + ON CONFLICT (alert_name, labels) DO UPDATE SET + threshold=EXCLUDED.threshold, + watch_time=EXCLUDED.watch_time, + updated_at=now() + ` + _, err := s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, m.WatchTime) + if err != nil { + return false, fmt.Errorf("upsert meta: %w", err) + } + // created flag is not easily observable here without RETURNING clause; return false. 
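// A sketch of how the flag could be surfaced (assumes switching this call to
// a row-returning query): append `RETURNING (xmax = 0) AS created` to the
// statement and scan the result — in PostgreSQL, xmax is 0 for a freshly
// inserted row, so `created` is true on the insert path and false on the
// conflict/update path.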
+ return false, nil +} + +func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { + labelsJSON, _ := json.Marshal(labels) + const q = ` + SELECT alert_name, labels, threshold, watch_time + FROM alert_rule_metas + WHERE alert_name = $1 AND labels = $2::jsonb + ` + rows, err := s.DB.QueryContext(ctx, q, name, string(labelsJSON)) + if err != nil { + return nil, fmt.Errorf("get metas: %w", err) + } + defer rows.Close() + var res []*AlertRuleMeta + for rows.Next() { + var alertName string + var labelsRaw string + var threshold float64 + var watch any + if err := rows.Scan(&alertName, &labelsRaw, &threshold, &watch); err != nil { + return nil, fmt.Errorf("scan meta: %w", err) + } + lm := LabelMap{} + _ = json.Unmarshal([]byte(labelsRaw), &lm) + meta := &AlertRuleMeta{AlertName: alertName, Labels: lm, Threshold: threshold} + // best-effort: watch_time may come back as string or duration; we try string -> duration + switch v := watch.(type) { + case string: + if d, err := timeParseDurationPG(v); err == nil { + meta.WatchTime = d + } + } + res = append(res, meta) + } + return res, nil +} + +func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { + labelsJSON, _ := json.Marshal(labels) + const q = `DELETE FROM alert_rule_metas WHERE alert_name=$1 AND labels=$2::jsonb` + _, err := s.DB.ExecContext(ctx, q, name, string(labelsJSON)) + if err != nil { + return fmt.Errorf("delete meta: %w", err) + } + return nil +} + +func (s *PgStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { + labelsJSON, _ := json.Marshal(log.Labels) + const q = ` + INSERT INTO alert_meta_change_logs(id, alert_name, change_type, labels, old_threshold, new_threshold, old_watch, new_watch, change_time) + VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) + ` + _, err := s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, log.OldWatch, log.NewWatch, log.ChangeTime) + if err != nil { + return fmt.Errorf("insert change log: %w", err) + } + return nil +} + +// timeParseDurationPG parses a small subset of PostgreSQL interval text output into time.Duration. +// Supported examples: "01:02:03", "02:03", "3600 seconds". Best-effort only. +func timeParseDurationPG(s string) (time.Duration, error) { + // HH:MM:SS + var h, m int + var sec float64 + if n, _ := fmt.Sscanf(s, "%d:%d:%f", &h, &m, &sec); n >= 2 { + d := time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(sec*float64(time.Second)) + return d, nil + } + var seconds float64 + if n, _ := fmt.Sscanf(s, "%f seconds", &seconds); n == 1 { + return time.Duration(seconds * float64(time.Second)), nil + } + return 0, fmt.Errorf("unsupported interval format: %s", s) +} diff --git a/internal/alerting/service/ruleset/types.go b/internal/alerting/service/ruleset/types.go new file mode 100644 index 0000000..4009832 --- /dev/null +++ b/internal/alerting/service/ruleset/types.go @@ -0,0 +1,86 @@ +package ruleset + +import ( + "context" + "time" +) + +// AlertRule defines a logical alert rule. This corresponds to a Prometheus alert rule entry +// excluding threshold information, which is managed separately via AlertRuleMeta. +// Name is the business identifier and should align with Prometheus alert: field. +type AlertRule struct { + Name string // unique rule name, typically equals Prometheus alert name + Description string // human readable explanation + Expr string // left-hand PromQL expression (e.g. 
p95 latency expression) + Op string // comparison operator: one of >, <, =, != + Severity string // severity code such as P0, P1, P2 +} + +// LabelMap represents a normalized set of label key-value pairs that identify a meta scope. +// Standardization rules are applied before persistence (see normalize.go). +type LabelMap map[string]string + +// AlertRuleMeta holds threshold and watch duration for a specific rule under certain labels. +// Threshold is a numeric boundary; WatchTime maps to Prometheus rule "for:" duration. +type AlertRuleMeta struct { + AlertName string // foreign key to AlertRule.Name + Labels LabelMap // normalized labels; {} means global default + Threshold float64 // numeric threshold + WatchTime time.Duration // watch window; exported or translated to Prometheus for: +} + +// ChangeLog captures before/after changes for auditing and potential rollback. +type ChangeLog struct { + ID string // external id for de-duplication + AlertName string // rule name + ChangeType string // Create | Update | Delete | Rollback + Labels LabelMap // affected labels + OldThreshold *float64 // nil if not applicable + NewThreshold *float64 // nil if not applicable + OldWatch *time.Duration // nil if not applicable + NewWatch *time.Duration // nil if not applicable + ChangeTime time.Time // when the change happened +} + +// Store abstracts persistence operations for rules and metas. Implementations should ensure +// correctness under concurrency via UPSERTs and, if necessary, advisory locks. +type Store interface { + // Rule CRUD + CreateRule(ctx context.Context, r *AlertRule) error + GetRule(ctx context.Context, name string) (*AlertRule, error) + UpdateRule(ctx context.Context, r *AlertRule) error + DeleteRule(ctx context.Context, name string) error + + // Meta operations (UPSERT by alert_name + labels) + UpsertMeta(ctx context.Context, m *AlertRuleMeta) (created bool, err error) + GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) + DeleteMeta(ctx context.Context, name string, labels LabelMap) error + + // Change logs + InsertChangeLog(ctx context.Context, log *ChangeLog) error + + // Transaction helper. Implementation must call fn with a transactional Store + // that respects atomicity for the ops executed within. + WithTx(ctx context.Context, fn func(Store) error) error +} + +// PromSync defines interactions with Prometheus or an exporter responsible for threshold materialization. +// Add/Delete manage the lifecycle of rule files; SyncMeta updates threshold sources. +type PromSync interface { + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error +} + +// AlertRuleMgr orchestrates validation, store operations, change logging, and Prometheus sync. 
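// A typical wiring, mirroring the README example (the alias map argument of
// NewManager may be nil; "service_version" -> "version" is illustrative):
//
//	store := NewPgStore(db)
//	prom := NewExporterSync()
//	var mgr AlertRuleMgr = NewManager(store, prom,
//		map[string]string{"service_version": "version"})
//	err := mgr.UpsertRuleMetas(ctx, meta)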
+type AlertRuleMgr interface { + LoadRule(ctx context.Context) error + UpsertRuleMetas(ctx context.Context, m *AlertRuleMeta) error + AddAlertRule(ctx context.Context, r *AlertRule) error + DeleteAlertRule(ctx context.Context, name string) error + + AddToPrometheus(ctx context.Context, r *AlertRule) error + DeleteFromPrometheus(ctx context.Context, name string) error + SyncMetaToPrometheus(ctx context.Context, m *AlertRuleMeta) error + RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error +} From 7ae82d12553ba2c02828e3b44b836cb6f629b701 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Sun, 21 Sep 2025 01:30:42 +0800 Subject: [PATCH 2/4] =?UTF-8?q?=E4=BC=98=E5=8C=96promethues=E4=B8=8E?= =?UTF-8?q?=E6=95=B0=E6=8D=AE=E5=BA=93=E6=93=8D=E4=BD=9C=E9=A1=BA=E5=BA=8F?= =?UTF-8?q?=EF=BC=8C=E6=94=B9=E8=BF=9B=E9=94=99=E8=AF=AF=E5=A4=84=E7=90=86?= =?UTF-8?q?=EF=BC=8C=E6=94=B9=E8=BF=9B=E5=AD=98=E5=82=A8=E7=B1=BB=E5=9E=8B?= =?UTF-8?q?=E8=BD=AC=E6=8D=A2?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/alerting/database-design.md | 23 ++- internal/alerting/service/ruleset/manager.go | 93 +++++++-- internal/alerting/service/ruleset/store_pg.go | 113 ++++++++--- .../alerting/service/ruleset/store_pg_test.go | 188 ++++++++++++++++++ 4 files changed, 352 insertions(+), 65 deletions(-) create mode 100644 internal/alerting/service/ruleset/store_pg_test.go diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md index 98b3e1c..b119349 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -76,13 +76,13 @@ ### 4) alert_rules(告警规则表) -| 字段名 | 类型 | 说明 | +| 字段名 | 类型 | 说明 | |--------|------|------| -|name|varchar(255)|主键,告警规则名称| -|description|text|可读标题,可拼接渲染为可读的 title| -|expr|text|左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version))| -|op|varchar(4)|阈值比较方式(枚举:>, <, =, !=)| -|severity|varchar(32)|告警等级,通常进入告警的 labels.severity| +| name | varchar(255) | 主键,告警规则名称 | +| description | text | 可读标题,可拼接渲染为可读的 title | +| expr | text | 左侧业务指标表达式,(通常对应 PromQL 左侧的聚合,如 sum(apitime) by (service, version)) | +| op | varchar(4) | 阈值比较方式(枚举:>, <, =, !=) | +| severity | varchar(32) | 告警等级,通常进入告警的 labels.severity | **约束建议:** - CHECK 约束:`op IN ('>', '<', '=', '!=')` @@ -91,11 +91,12 @@ ### 5) alert_rule_metas(规则阈值元信息表) -字段名 类型 说明 -alert_name varchar(255) 关联 `alert_rules.name` -labels jsonb 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 -threshold numeric 阈值(会被渲染成特定规则的 threshold metric 数值) -watch_time interval 持续时长(映射 Prometheus rule 的 for:) +| 字段名 | 类型 | 说明 | +|--------|------|------| +| alert_name | varchar(255) | 关联 `alert_rules.name` | +| labels | jsonb | 适用标签(示例:{"service":"s3","version":"v1"});为空 `{}` 表示全局 | +| threshold | numeric | 阈值(会被渲染成特定规则的 threshold metric 数值) | +| watch_time | interval | 持续时长(映射 Prometheus rule 的 for:) | **约束与索引建议:** - FOREIGN KEY: `(alert_name)` REFERENCES `alert_rules(name)` ON DELETE CASCADE diff --git a/internal/alerting/service/ruleset/manager.go b/internal/alerting/service/ruleset/manager.go index 03e58d2..7ffb279 100644 --- a/internal/alerting/service/ruleset/manager.go +++ b/internal/alerting/service/ruleset/manager.go @@ -32,20 +32,35 @@ func (m *Manager) AddAlertRule(ctx context.Context, r *AlertRule) error { if r == nil || r.Name == "" { return fmt.Errorf("invalid rule") } + // First ensure the rule is added to Prometheus successfully + // This guarantees Prometheus has the correct data even if DB write fails + if err := m.prom.AddToPrometheus(ctx, 
r); err != nil { + return fmt.Errorf("failed to add rule to Prometheus: %w", err) + } + // Then persist to database + // If this fails, the rule will still be in Prometheus, which is better than + // having it in DB but not in Prometheus (which would cause missing alerts) if err := m.store.CreateRule(ctx, r); err != nil { - return err + return fmt.Errorf("failed to create rule in database: %w", err) } - return m.prom.AddToPrometheus(ctx, r) + return nil } func (m *Manager) DeleteAlertRule(ctx context.Context, name string) error { if name == "" { return fmt.Errorf("invalid name") } + // First remove from Prometheus to stop alerting immediately + // This prevents false alerts if DB deletion fails + if err := m.prom.DeleteFromPrometheus(ctx, name); err != nil { + return fmt.Errorf("failed to delete rule from Prometheus: %w", err) + } + // Then remove from database + // If this fails, the rule is already removed from Prometheus (no false alerts) if err := m.store.DeleteRule(ctx, name); err != nil { - return err + return fmt.Errorf("failed to delete rule from database: %w", err) } - return m.prom.DeleteFromPrometheus(ctx, name) + return nil } func (m *Manager) AddToPrometheus(ctx context.Context, r *AlertRule) error { @@ -66,33 +81,54 @@ func (m *Manager) UpsertRuleMetas(ctx context.Context, meta *AlertRuleMeta) erro if err := validateMeta(meta); err != nil { return err } + + // First, get the old meta for change logging + oldList, err := m.store.GetMetas(ctx, meta.AlertName, meta.Labels) + if err != nil { + return err + } + var old *AlertRuleMeta + if len(oldList) > 0 { + old = oldList[0] + } + + // Prepare change log parameters outside of transaction to minimize lock time + var changeLog *ChangeLog + if old != nil || meta != nil { + changeLog = m.prepareChangeLog(old, meta) + } + + // First ensure the meta is synced to Prometheus successfully + // This guarantees Prometheus has the correct threshold data even if DB write fails + if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { + return fmt.Errorf("failed to sync meta to Prometheus: %w", err) + } + + // Then persist to database within a transaction + // If this fails, the meta will still be in Prometheus, which is better than + // having it in DB but not in Prometheus (which would cause incorrect thresholds) return m.store.WithTx(ctx, func(tx Store) error { - oldList, err := tx.GetMetas(ctx, meta.AlertName, meta.Labels) - if err != nil { - return err - } - var old *AlertRuleMeta - if len(oldList) > 0 { - old = oldList[0] - } _, err = tx.UpsertMeta(ctx, meta) if err != nil { return err } - if err := m.RecordMetaChangeLog(ctx, old, meta); err != nil { - return err - } - if err := m.prom.SyncMetaToPrometheus(ctx, meta); err != nil { - return err + // Insert pre-prepared change log + if changeLog != nil { + if err := tx.InsertChangeLog(ctx, changeLog); err != nil { + return err + } } return nil }) } -func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { +// prepareChangeLog prepares change log parameters outside of transaction to minimize lock time +func (m *Manager) prepareChangeLog(oldMeta, newMeta *AlertRuleMeta) *ChangeLog { if newMeta == nil { return nil } + + // Prepare all parameters outside of transaction var oldTh, newTh *float64 var oldW, newW *time.Duration if oldMeta != nil { @@ -103,8 +139,14 @@ func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *Ale newTh = &newMeta.Threshold newW = &newMeta.WatchTime } - log := &ChangeLog{ - ID: 
fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), time.Now().UnixNano()), + + // Generate ID and timestamp outside of transaction + now := time.Now() + changeTime := now.UTC() + id := fmt.Sprintf("%s-%s-%d", newMeta.AlertName, CanonicalLabelKey(newMeta.Labels), now.UnixNano()) + + return &ChangeLog{ + ID: id, AlertName: newMeta.AlertName, ChangeType: classifyChange(oldMeta, newMeta), Labels: newMeta.Labels, @@ -112,9 +154,16 @@ func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *Ale NewThreshold: newTh, OldWatch: oldW, NewWatch: newW, - ChangeTime: time.Now().UTC(), + ChangeTime: changeTime, + } +} + +func (m *Manager) RecordMetaChangeLog(ctx context.Context, oldMeta, newMeta *AlertRuleMeta) error { + changeLog := m.prepareChangeLog(oldMeta, newMeta) + if changeLog == nil { + return nil } - return m.store.InsertChangeLog(ctx, log) + return m.store.InsertChangeLog(ctx, changeLog) } func classifyChange(oldMeta, newMeta *AlertRuleMeta) string { diff --git a/internal/alerting/service/ruleset/store_pg.go b/internal/alerting/service/ruleset/store_pg.go index fe3d231..cd4c063 100644 --- a/internal/alerting/service/ruleset/store_pg.go +++ b/internal/alerting/service/ruleset/store_pg.go @@ -6,12 +6,14 @@ import ( "fmt" "time" + "github.com/jackc/pgx/v5/pgtype" abd "github.com/qiniu/zeroops/internal/alerting/database" ) // PgStore is a PostgreSQL-backed Store implementation using the alerting database wrapper. // Note: The current database wrapper does not expose transactions; WithTx acts as a simple wrapper. // For production-grade atomicity, extend the database wrapper to support sql.Tx and wire it here. +// This implementation uses pgx native types to avoid manual parsing of PostgreSQL interval types. type PgStore struct { DB *abd.Database } @@ -76,16 +78,22 @@ func (s *PgStore) DeleteRule(ctx context.Context, name string) error { } func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error) { - labelsJSON, _ := json.Marshal(m.Labels) + labelsJSON, err := json.Marshal(m.Labels) + if err != nil { + return false, fmt.Errorf("marshal labels: %w", err) + } + + // Convert time.Duration to pgtype.Interval + interval := durationToPgInterval(m.WatchTime) + const q = ` INSERT INTO alert_rule_metas(alert_name, labels, threshold, watch_time) VALUES ($1, $2::jsonb, $3, $4) ON CONFLICT (alert_name, labels) DO UPDATE SET threshold=EXCLUDED.threshold, - watch_time=EXCLUDED.watch_time, - updated_at=now() + watch_time=EXCLUDED.watch_time ` - _, err := s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, m.WatchTime) + _, err = s.DB.ExecContext(ctx, q, m.AlertName, string(labelsJSON), m.Threshold, interval) if err != nil { return false, fmt.Errorf("upsert meta: %w", err) } @@ -94,7 +102,10 @@ func (s *PgStore) UpsertMeta(ctx context.Context, m *AlertRuleMeta) (bool, error } func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([]*AlertRuleMeta, error) { - labelsJSON, _ := json.Marshal(labels) + labelsJSON, err := json.Marshal(labels) + if err != nil { + return nil, fmt.Errorf("marshal labels for get: %w", err) + } const q = ` SELECT alert_name, labels, threshold, watch_time FROM alert_rule_metas @@ -110,19 +121,19 @@ func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([ var alertName string var labelsRaw string var threshold float64 - var watch any + var watch pgtype.Interval if err := rows.Scan(&alertName, &labelsRaw, &threshold, &watch); err != nil { return nil, 
fmt.Errorf("scan meta: %w", err) } lm := LabelMap{} - _ = json.Unmarshal([]byte(labelsRaw), &lm) + if err := json.Unmarshal([]byte(labelsRaw), &lm); err != nil { + return nil, fmt.Errorf("unmarshal labels: %w", err) + } meta := &AlertRuleMeta{AlertName: alertName, Labels: lm, Threshold: threshold} - // best-effort: watch_time may come back as string or duration; we try string -> duration - switch v := watch.(type) { - case string: - if d, err := timeParseDurationPG(v); err == nil { - meta.WatchTime = d - } + + // Convert pgtype.Interval to time.Duration + if duration, err := pgIntervalToDuration(watch); err == nil { + meta.WatchTime = duration } res = append(res, meta) } @@ -130,9 +141,12 @@ func (s *PgStore) GetMetas(ctx context.Context, name string, labels LabelMap) ([ } func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) error { - labelsJSON, _ := json.Marshal(labels) + labelsJSON, err := json.Marshal(labels) + if err != nil { + return fmt.Errorf("marshal labels: %w", err) + } const q = `DELETE FROM alert_rule_metas WHERE alert_name=$1 AND labels=$2::jsonb` - _, err := s.DB.ExecContext(ctx, q, name, string(labelsJSON)) + _, err = s.DB.ExecContext(ctx, q, name, string(labelsJSON)) if err != nil { return fmt.Errorf("delete meta: %w", err) } @@ -140,31 +154,66 @@ func (s *PgStore) DeleteMeta(ctx context.Context, name string, labels LabelMap) } func (s *PgStore) InsertChangeLog(ctx context.Context, log *ChangeLog) error { - labelsJSON, _ := json.Marshal(log.Labels) + labelsJSON, err := json.Marshal(log.Labels) + if err != nil { + return fmt.Errorf("marshal labels for changelog: %w", err) + } + + // Convert time.Duration to pgtype.Interval for old and new watch times + var oldWatch, newWatch *pgtype.Interval + if log.OldWatch != nil { + interval := durationToPgInterval(*log.OldWatch) + oldWatch = &interval + } + if log.NewWatch != nil { + interval := durationToPgInterval(*log.NewWatch) + newWatch = &interval + } + const q = ` INSERT INTO alert_meta_change_logs(id, alert_name, change_type, labels, old_threshold, new_threshold, old_watch, new_watch, change_time) VALUES ($1, $2, $3, $4, $5, $6, $7, $8, $9) ` - _, err := s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, log.OldWatch, log.NewWatch, log.ChangeTime) + _, err = s.DB.ExecContext(ctx, q, log.ID, log.AlertName, log.ChangeType, string(labelsJSON), log.OldThreshold, log.NewThreshold, oldWatch, newWatch, log.ChangeTime) if err != nil { return fmt.Errorf("insert change log: %w", err) } return nil } -// timeParseDurationPG parses a small subset of PostgreSQL interval text output into time.Duration. -// Supported examples: "01:02:03", "02:03", "3600 seconds". Best-effort only. -func timeParseDurationPG(s string) (time.Duration, error) { - // HH:MM:SS - var h, m int - var sec float64 - if n, _ := fmt.Sscanf(s, "%d:%d:%f", &h, &m, &sec); n >= 2 { - d := time.Duration(h)*time.Hour + time.Duration(m)*time.Minute + time.Duration(sec*float64(time.Second)) - return d, nil - } - var seconds float64 - if n, _ := fmt.Sscanf(s, "%f seconds", &seconds); n == 1 { - return time.Duration(seconds * float64(time.Second)), nil - } - return 0, fmt.Errorf("unsupported interval format: %s", s) +// durationToPgInterval converts a time.Duration to pgtype.Interval. +// Note: This conversion assumes the duration represents a fixed time period. +// For durations that include months or years, this conversion may not be accurate. 
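// Worked example: 25h30m is 91,800,000,000 µs in total; one day accounts for
// 86,400,000,000 µs, leaving 5,400,000,000 µs (1h30m), so the result is
// pgtype.Interval{Days: 1, Microseconds: 5_400_000_000}.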
+func durationToPgInterval(d time.Duration) pgtype.Interval { + // Convert to total microseconds first + totalMicroseconds := d.Microseconds() + + // Calculate days and remaining microseconds + days := totalMicroseconds / (24 * 60 * 60 * 1000000) // 24 hours * 60 minutes * 60 seconds * 1,000,000 microseconds + remainingMicroseconds := totalMicroseconds % (24 * 60 * 60 * 1000000) + + return pgtype.Interval{ + Microseconds: remainingMicroseconds, + Days: int32(days), + Months: 0, // Duration doesn't include months + Valid: true, + } +} + +// pgIntervalToDuration converts a pgtype.Interval to time.Duration. +// This function returns an error if the interval contains months or years, +// as these cannot be accurately converted to a fixed duration. +func pgIntervalToDuration(interval pgtype.Interval) (time.Duration, error) { + if !interval.Valid { + return 0, fmt.Errorf("interval is not valid") + } + + // Check if the interval contains months or years + if interval.Months != 0 { + return 0, fmt.Errorf("cannot convert interval with months to duration: %d months", interval.Months) + } + + // Convert to duration + totalMicroseconds := interval.Microseconds + int64(interval.Days)*24*60*60*1000000 + return time.Duration(totalMicroseconds) * time.Microsecond, nil } diff --git a/internal/alerting/service/ruleset/store_pg_test.go b/internal/alerting/service/ruleset/store_pg_test.go new file mode 100644 index 0000000..a19c1e5 --- /dev/null +++ b/internal/alerting/service/ruleset/store_pg_test.go @@ -0,0 +1,188 @@ +package ruleset + +import ( + "testing" + "time" + + "github.com/jackc/pgx/v5/pgtype" +) + +func TestDurationToPgInterval(t *testing.T) { + tests := []struct { + name string + duration time.Duration + expected pgtype.Interval + }{ + { + name: "Zero duration", + duration: 0, + expected: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 second", + duration: 1 * time.Second, + expected: pgtype.Interval{ + Microseconds: 1000000, // 1 second = 1,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 minute", + duration: 1 * time.Minute, + expected: pgtype.Interval{ + Microseconds: 60000000, // 1 minute = 60,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 hour", + duration: 1 * time.Hour, + expected: pgtype.Interval{ + Microseconds: 3600000000, // 1 hour = 3,600,000,000 microseconds + Days: 0, + Months: 0, + Valid: true, + }, + }, + { + name: "1 day", + duration: 24 * time.Hour, + expected: pgtype.Interval{ + Microseconds: 0, // Days are stored separately + Days: 1, + Months: 0, + Valid: true, + }, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := durationToPgInterval(tt.duration) + if got != tt.expected { + t.Errorf("durationToPgInterval() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestPgIntervalToDuration(t *testing.T) { + tests := []struct { + name string + interval pgtype.Interval + expected time.Duration + expectError bool + }{ + { + name: "Valid interval - microseconds only", + interval: pgtype.Interval{ + Microseconds: 1000000, // 1 second + Days: 0, + Months: 0, + Valid: true, + }, + expected: 1 * time.Second, + expectError: false, + }, + { + name: "Valid interval - days only", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 1, + Months: 0, + Valid: true, + }, + expected: 24 * time.Hour, + expectError: false, + }, + { + name: "Valid interval - days and microseconds", + interval: pgtype.Interval{ + Microseconds: 
1000000, // 1 second + Days: 1, // 1 day + Months: 0, + Valid: true, + }, + expected: 24*time.Hour + 1*time.Second, + expectError: false, + }, + { + name: "Invalid interval - contains months", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 1, + Valid: true, + }, + expected: 0, + expectError: true, + }, + { + name: "Invalid interval - not valid", + interval: pgtype.Interval{ + Microseconds: 0, + Days: 0, + Months: 0, + Valid: false, + }, + expected: 0, + expectError: true, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got, err := pgIntervalToDuration(tt.interval) + if (err != nil) != tt.expectError { + t.Errorf("pgIntervalToDuration() error = %v, expectError %v", err, tt.expectError) + return + } + if got != tt.expected { + t.Errorf("pgIntervalToDuration() = %v, want %v", got, tt.expected) + } + }) + } +} + +func TestDurationRoundTrip(t *testing.T) { + tests := []struct { + name string + duration time.Duration + }{ + {"Zero", 0}, + {"1 second", 1 * time.Second}, + {"1 minute", 1 * time.Minute}, + {"1 hour", 1 * time.Hour}, + {"1 day", 24 * time.Hour}, + {"1 day 1 hour", 25 * time.Hour}, + {"1 day 1 hour 1 minute", 25*time.Hour + 1*time.Minute}, + {"1 day 1 hour 1 minute 1 second", 25*time.Hour + 1*time.Minute + 1*time.Second}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + // Convert duration to pgtype.Interval + interval := durationToPgInterval(tt.duration) + + // Convert back to duration + got, err := pgIntervalToDuration(interval) + if err != nil { + t.Errorf("pgIntervalToDuration() error = %v", err) + return + } + + if got != tt.duration { + t.Errorf("Round trip conversion failed: got %v, want %v", got, tt.duration) + } + }) + } +} From 132f692e98d84ce0cb5e0d6fffa3baf38c082471 Mon Sep 17 00:00:00 2001 From: acd19ml Date: Sun, 21 Sep 2025 03:01:41 +0800 Subject: [PATCH 3/4] feat: implement alert healing with observation window mechanism - Add heal_actions table and related DAO/Service layers - Implement observation window mechanism using Redis - Add P0/P1/P2 alert processing logic with fault healing - Update remediation consumer with healing and observation flows - Add comprehensive tests for new functionality - Update documentation with new healing process --- docs/alerting/database-design.md | 47 ++- go.mod | 3 + internal/alerting/database/database.go | 5 + .../alerting/service/remediation/README.md | 373 +++++++++++++++--- .../alerting/service/remediation/consumer.go | 258 ++++++++++-- .../service/remediation/heal_action_dao.go | 145 +++++++ .../remediation/heal_action_service.go | 172 ++++++++ .../remediation/heal_action_service_test.go | 178 +++++++++ .../service/remediation/init_heal_actions.sql | 38 ++ .../service/remediation/observation_window.go | 169 ++++++++ .../remediation/observation_window_test.go | 100 +++++ .../alerting/service/remediation/types.go | 74 ++++ 12 files changed, 1492 insertions(+), 70 deletions(-) create mode 100644 internal/alerting/service/remediation/heal_action_dao.go create mode 100644 internal/alerting/service/remediation/heal_action_service.go create mode 100644 internal/alerting/service/remediation/heal_action_service_test.go create mode 100644 internal/alerting/service/remediation/init_heal_actions.sql create mode 100644 internal/alerting/service/remediation/observation_window.go create mode 100644 internal/alerting/service/remediation/observation_window_test.go create mode 100644 internal/alerting/service/remediation/types.go diff --git 
a/docs/alerting/database-design.md b/docs/alerting/database-design.md index b119349..0f860df 100644 --- a/docs/alerting/database-design.md +++ b/docs/alerting/database-design.md @@ -2,7 +2,7 @@ ## 概述 -本文档为最新数据库设计,总计包含 6 张表: +本文档为最新数据库设计,总计包含 7 张表: - alert_issues - alert_issue_comments @@ -10,6 +10,7 @@ - alert_rules - alert_rule_metas - service_states +- heal_actions ## 数据表设计 @@ -111,7 +112,7 @@ --- -### 7) service_states(服务状态表) +### 6) service_states(服务状态表) 追踪服务在某一版本上的健康状态与处置进度。 @@ -127,6 +128,34 @@ **索引建议:** - PRIMARY KEY: `(service, version)` +--- + +### 7) heal_actions(告警治愈解决方案表) + +存储不同故障域对应的治愈方案和规则。 + +| 字段名 | 类型 | 说明 | +|--------|------|------| +| id | varchar(255) PK | 治愈方案 ID | +| desc | text | 简单描述,如 action 是处理什么告警场景的 | +| type | varchar(255) | 对应的故障域类型 | +| rules | jsonb | 条件规则:{condition1: action1, condition2: action2} | + +**索引建议:** +- PRIMARY KEY: `id` +- INDEX: `(type)` + +**示例数据:** +```sql +INSERT INTO heal_actions (id, desc, type, rules) VALUES +('service_version_rollback', '服务版本回滚方案', 'service_version_issue', + '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}'), +('service_version_alert', '服务版本告警方案', 'service_version_issue', + '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚"}'); +``` + +TODO: health_state映射逻辑 + ## 数据关系(ER) ```mermaid @@ -175,13 +204,25 @@ erDiagram text content } + heal_actions { + varchar id PK + text desc + varchar type + jsonb rules + } + %% 通过 service 等标签在应用层逻辑关联 alert_rule_metas ||..|| alert_rules : "by alert_name" service_states ||..|| alert_rule_metas : "by service/version labels" + heal_actions ||..|| alert_issues : "by fault domain analysis" ``` ## 数据流转 1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。 2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。 -3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file +3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 +4. 
**告警治愈流程**: + - P0 告警:根据 `alert_issues.labels` 识别故障域,查询 `heal_actions` 获取治愈方案 + - 执行治愈操作(如回滚),成功后更新 `alert_issues` 和 `service_states` 状态 + - P1/P2 告警:直接进入下钻分析,记录分析结果到 `alert_issue_comments` \ No newline at end of file diff --git a/go.mod b/go.mod index 6094f9c..8cf046f 100644 --- a/go.mod +++ b/go.mod @@ -9,6 +9,7 @@ require ( github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 + github.com/stretchr/testify v1.11.1 ) require ( @@ -16,6 +17,7 @@ require ( github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect + github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect github.com/gin-contrib/cors v1.7.6 // indirect @@ -39,6 +41,7 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect + github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect golang.org/x/arch v0.18.0 // indirect diff --git a/internal/alerting/database/database.go b/internal/alerting/database/database.go index e6ee504..6b2ab9b 100644 --- a/internal/alerting/database/database.go +++ b/internal/alerting/database/database.go @@ -38,3 +38,8 @@ func (d *Database) ExecContext(ctx context.Context, q string, args ...any) (sql. func (d *Database) QueryContext(ctx context.Context, q string, args ...any) (*sql.Rows, error) { return d.db.QueryContext(ctx, q, args...) } + +// QueryRowContext exposes database/sql QueryRowContext for single row SELECT queries. +func (d *Database) QueryRowContext(ctx context.Context, q string, args ...any) *sql.Row { + return d.db.QueryRowContext(ctx, q, args...) +} diff --git a/internal/alerting/service/remediation/README.md b/internal/alerting/service/remediation/README.md index de5ca08..ca41e49 100644 --- a/internal/alerting/service/remediation/README.md +++ b/internal/alerting/service/remediation/README.md @@ -1,23 +1,28 @@ -# remediation — 通道消费与自动回滚(Mock) +# remediation — 告警治愈与下钻分析 -本包规划一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,模拟执行“自动回滚”,回滚成功后将相关告警与服务态标记为恢复。 +本包实现一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,根据告警等级进行分流处理: +- **P0 告警**:进入"故障治愈"模块,执行自动修复操作 +- **P1/P2 告警**:进入"下钻分析"模块,进行深度分析 —— ## 1. 
目标 - 订阅 `healthcheck` 的 `AlertMessage`(进程内 channel) -- 对每条消息: - 1) Mock 调用回滚接口 `POST /v1/deployments/:deployID/rollback` - 2) `sleep 30s` 后返回“回滚成功”的模拟响应 - 3) 若成功,则更新 DB 与缓存: - - `alert_issues.alert_state = 'Restored'` - - `alert_issues.state = 'Closed'` - - `service_states.health_state = 'Normal'` - - `service_states.resolved_at = NOW()`(当前时间) - - 同时在 `alert_issue_comments` 中追加一条 AI 分析评论(见下文内容模板) - -> 说明:本阶段仅实现消费与 Mock,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 +- 根据 `level` 字段进行分流: + - **P0 告警**:故障治愈流程 + 1) 确认故障域(从 labels 分析 service_name + version) + 2) 查询 `heal_actions` 表获取治愈方案 + 3) 执行治愈操作(当前仅支持回滚) + 4) 治愈成功后启动观察窗口(默认30分钟) + 5) 观察窗口内如果出现新告警,取消观察并重新处理 + 6) 观察窗口完成后,更新服务状态为正常 + - **P1/P2 告警**:直接进入下钻分析流程 + 1) 执行 AI 分析 + 2) 更新告警状态为恢复 + 3) 记录分析结果到评论 + +> 说明:本阶段实现故障域识别和治愈方案查询,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 —— @@ -37,8 +42,10 @@ } ``` +- 故障域识别:从 `Labels` 中提取 `service_name` 和 `version` 信息 - deployID 的来源(用于构造回滚 URL): - - Mock 阶段:可从 `Labels["deploy_id"]`(若存在)读取;若为空,可按 `{service}:{version}` 组装一个占位 ID。 + - 可从 `Labels["deploy_id"]`(若存在)读取 + - 若为空,可按 `{service}:{version}` 组装一个占位 ID —— @@ -62,7 +69,7 @@ REMEDIATION_ROLLBACK_SLEEP=30s —— -## 4. 流程(伪代码) +## 4. 处理流程(伪代码) ```go func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rdb *redis.Client) { @@ -71,30 +78,256 @@ func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rd case <-ctx.Done(): return case m := <-ch: - // 1) 组装回滚 URL(Mock) - deployID := m.Labels["deploy_id"] - if deployID == "" { - // 仅 Mock:用 service:version 兜底 - deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + switch m.Level { + case "P0": + // P0 告警:故障治愈流程 + handleP0Alert(ctx, m, db, rdb) + case "P1", "P2": + // P1/P2 告警:下钻分析流程 + handleP1P2Alert(ctx, m, db, rdb) + default: + log.Printf("Unknown alert level: %s", m.Level) } - url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deployID) + } + } +} - // 2) 发起回滚(Mock):sleep 指定时间再判为成功 - sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - // TODO: 如需真实 HTTP 调用,可在此发起 POST 并根据响应判断 +// P0 告警处理:故障治愈流程 +func handleP0Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 确认故障域 + faultDomain := identifyFaultDomain(m.Labels) + + // 2) 查询治愈方案 + healAction, err := queryHealAction(ctx, db, faultDomain) + if err != nil { + log.Printf("Failed to query heal action: %v", err) + // 治愈方案查询失败,直接进入下钻分析 + handleDrillDownAnalysis(ctx, m, db, rdb) + return + } + + // 3) 执行治愈操作 + success := executeHealAction(ctx, healAction, m) + if !success { + log.Printf("Heal action failed for alert %s", m.ID) + // 治愈操作失败,直接进入下钻分析 + handleDrillDownAnalysis(ctx, m, db, rdb) + return + } + + // 4) 治愈成功后启动观察窗口,延迟状态更新 + handleDrillDownAnalysisWithObservation(ctx, m, db, rdb) +} - // 3) 成功后,先写入 AI 分析评论,再更新 DB 与缓存状态 - _ = addAIAnalysisComment(ctx, db, m) - _ = markRestoredInDB(ctx, db, m) - _ = markRestoredInCache(ctx, rdb, m) - } +// P1/P2 告警处理:下钻分析流程 +func handleP1P2Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + handleDrillDownAnalysis(ctx, m, db, rdb) +} + +// 故障域识别 +func identifyFaultDomain(labels map[string]string) string { + service := labels["service_name"] + version := labels["version"] + + if service != "" && version != "" { + return "service_version_issue" } + + // 可根据更多条件扩展其他故障域 + return "unknown" +} + +// 查询治愈方案 +func queryHealAction(ctx context.Context, db *Database, faultDomain string) (*HealAction, error) { + const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` + // 实现查询逻辑 + 
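// 查询实现示意(假设基于 database/sql,复用本补丁为 Database 新增的 QueryRowContext;
+	// 注意 desc 为 PostgreSQL 保留字,真实 SQL 中需写作 "desc" 或改名为 description):
+	//   row := db.QueryRowContext(ctx, q, faultDomain)
+	//   var a HealAction
+	//   var rulesJSON string
+	//   if err := row.Scan(&a.ID, &a.Desc, &a.Type, &rulesJSON); err != nil {
+	//       return nil, err
+	//   }
+	//   a.Rules = json.RawMessage(rulesJSON)
+	//   return &a, nil
+	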
return nil, nil +} + +// 执行治愈操作 +func executeHealAction(ctx context.Context, action *HealAction, m AlertMessage) bool { + // 根据 action.rules 中的条件执行相应操作 + // 当前仅支持回滚操作 + if action.Rules["action"] == "rollback" { + return executeRollback(ctx, m) + } else if action.Rules["action"] == "alert" { + log.Printf("Alert: %s", action.Rules["message"]) + return false + } + return false +} + +// 执行回滚操作 +func executeRollback(ctx context.Context, m AlertMessage) bool { + deployID := m.Labels["deploy_id"] + if deployID == "" { + deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + } + + // Mock 回滚:sleep 指定时间 + sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + // TODO: 真实 HTTP 调用回滚接口 + + return true +} + +// 下钻分析处理(P1/P2 告警直接使用) +func handleDrillDownAnalysis(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 执行 AI 分析 + _ = addAIAnalysisComment(ctx, db, m) + + // 2) 更新告警状态为恢复 + _ = markRestoredInDB(ctx, db, m) + + // 3) 更新缓存状态 + _ = markRestoredInCache(ctx, rdb, m) +} + +// 下钻分析处理(P0 告警治愈后使用,延迟状态更新) +func handleDrillDownAnalysisWithObservation(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { + // 1) 执行 AI 分析 + _ = addAIAnalysisComment(ctx, db, m) + + // 2) 记录治愈完成评论,但不更新告警状态 + _ = addHealingCompletedComment(ctx, db, m) + + // 3) 启动观察窗口,等待30分钟 + _ = startObservationWindow(ctx, m.Service, m.Version, m.ID, 30*time.Minute) + + // 注意:此时不更新 alert_issues.alert_state 和 service_states.health_state + // 状态更新将在观察窗口完成后进行 +} + +// 观察窗口完成后的处理 +func completeObservationWindow(ctx context.Context, service, version string, db *Database, rdb *redis.Client) { + // 1) 完成观察窗口 + _ = completeObservation(ctx, service, version) + + // 2) 更新 alert_issues.alert_state 为 'Restored' + // 3) 更新 service_states.health_state 为 'Normal' + // 4) 更新相关缓存 + _ = markServiceAsNormal(ctx, service, version, db, rdb) + + log.Printf("Observation window completed for service %s version %s, status updated to Normal", service, version) } ``` —— -## 5. DB 更新(SQL 建议) +## 5. 故障域识别与治愈方案 + +### 故障域类型 + +当前支持的故障域类型: + +1. **service_version_issue**:服务版本问题 + - 识别条件:`labels["service_name"]` 和 `labels["version"]` 都存在 + - 治愈方案: + - 发布中版本:执行回滚操作 + - 已完成发布版本:提示暂不支持自动回滚 + +2. **unknown**:未知故障域 + - 识别条件:无法从标签中识别出已知故障域 + - 处理方式:跳过治愈,直接进入下钻分析 + +### 治愈方案规则 + +`heal_actions.rules` 字段的 JSON 结构: + +```json +{ + "deployment_status": "deploying|deployed", + "action": "rollback|alert", + "target": "previous_version", + "message": "版本已发布,暂不支持自动回滚" +} +``` + +### 治愈操作类型 + +1. **rollback**:执行回滚操作 + - 调用部署系统的回滚接口 + - 回滚到上一个稳定版本 + +2. **alert**:仅告警,不执行自动操作 + - 记录告警信息 + - 需要人工介入处理 + +### 扩展性设计 + +- 故障域类型可扩展:整体问题、单机房问题、网络问题等 +- 治愈方案可扩展:重启服务、扩容、切换流量等 +- 规则条件可扩展:基于更多标签和指标进行判断 + +#### 添加新的故障域类型 + +1. 在 `types.go` 中添加新的 `FaultDomain` 常量 +2. 在 `IdentifyFaultDomain` 方法中添加识别逻辑 +3. 在数据库中配置对应的治愈方案 + +#### 添加新的治愈操作类型 + +1. 在 `HealActionRules` 结构体中添加新字段 +2. 在 `ExecuteHealAction` 方法中添加新的 case 分支 +3. 实现具体的治愈操作逻辑 + +### 观察窗口机制 + +观察窗口是治愈操作完成后的验证期,用于确保治愈操作的有效性: + +1. **启动条件**:P0 告警治愈操作成功完成后自动启动 +2. **持续时间**:默认30分钟,可配置 +3. **监控内容**:观察该服务是否在窗口期内出现新的告警 +4. **处理逻辑**: + - 如果窗口期内出现新告警:取消观察窗口,重新进入治愈流程 + - 如果窗口期内无新告警:完成观察窗口,更新服务状态为正常 +5. **状态更新时机**: + - **治愈操作完成后**:不立即更新状态,只记录治愈完成评论 + - **观察窗口完成后**:同时更新 `alert_issues.alert_state` 为 `Restored` 和 `service_states.health_state` 为 `Normal` +6. **关键原则**:每次修改 `service_states.health_state` 为 `Normal` 时,都必须同时修改 `alert_issues.alert_state` 为 `Restored` + +—— + +## 6. 
代码使用示例
+
+### 数据库初始化
+
+```bash
+# 执行初始化脚本
+psql -U postgres -d zeroops -f init_heal_actions.sql
+```
+
+### 代码使用
+
+```go
+// 创建服务
+healDAO := NewPgHealActionDAO(db)
+healService := NewHealActionService(healDAO)
+
+// 识别故障域
+faultDomain := healService.IdentifyFaultDomain(labels)
+
+// 获取治愈方案
+healAction, err := healService.GetHealAction(ctx, faultDomain)
+
+// 执行治愈操作
+result, err := healService.ExecuteHealAction(ctx, healAction, alertID, labels)
+```
+
+### 测试
+
+运行测试:
+
+```bash
+go test ./internal/alerting/service/remediation -v
+```
+
+测试覆盖:
+- 故障域识别逻辑
+- 治愈操作执行
+- 部署状态判断
+
+## 7. DB 更新(SQL 建议)
 
 - 告警状态:
 ```sql
@@ -121,7 +354,7 @@ VALUES (
 );
 ```
 
-评论内容模板(Markdown,多行):
+评论内容模板(Markdown,多行,内容暂未设计):
 ```
 ## AI分析结果
 **问题类型**:非发版本导致的问题
@@ -137,7 +370,7 @@ VALUES (
 
 ——
 
-## 6. 缓存更新(Redis,Lua CAS 建议)
+## 8. 缓存更新(Redis,Lua CAS 建议)
 
 - 告警缓存 `alert:issue:{id}`:
 ```lua
@@ -178,30 +411,80 @@ return 1
 
 ——
 
-## 7. 幂等与重试
+## 9. 幂等与重试
 
 - 幂等:同一 `AlertMessage.ID` 的回滚处理应具备幂等性,重复消费不应产生额外副作用。
 - 重试:Mock 模式下可忽略;接入真实接口后,对 5xx/网络错误考虑重试与退避,最终写入失败应有告警与补偿。
 
 ——
 
-## 8. 验证步骤(与 healthcheck E2E 相衔接)
+## 10. 验证步骤(与 healthcheck E2E 相衔接)
+
+### 基础验证步骤
 
 1) 启动 Redis/Postgres 与 API(参考 `healthcheck/E2E_VALIDATION.md` 与 `env_example.txt`)
-2) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)`
-3) `curl` 触发 Webhook,`alert_issues` 入库为 `Pending`
-4) 等待 `healthcheck` 将缓存态切到 `InProcessing`
-5) 等待 `remediation` mock 回滚完成 → DB 与缓存更新:
-   - `alert_issues.alert_state = 'Restored'`
-   - `service_states.health_state = 'Normal'`
-   - `service_states.resolved_at = NOW()`
-6) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新(comments 仍为 mock)
+2) 创建 `heal_actions` 表并插入测试数据
+3) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)`
+
+### P0 告警验证(故障治愈流程)
+
+4) 触发 P0 级别 Webhook,`alert_issues` 入库为 `Pending`
+5) 等待 `healthcheck` 将缓存态切到 `InProcessing`
+6) 验证故障域识别:检查日志中是否正确识别为 `service_version_issue`
+7) 验证治愈方案查询:检查是否从 `heal_actions` 表查询到对应方案
+8) 等待 `remediation` 执行治愈操作完成:
+   - 验证观察窗口已启动(Redis 中存在观察窗口记录)
+   - `alert_issue_comments` 中新增治愈完成评论
+   - **重要**:验证 `alert_issues.alert_state` 仍为 `InProcessing`(未更新为 `Restored`)
+   - **重要**:验证 `service_states.health_state` 未更新为 `Normal`
+9) 等待观察窗口完成(30分钟后)或模拟窗口期内新告警:
+   - **如果无新告警**:
+     - 验证观察窗口自动完成
+     - 验证状态同时更新为 `alert_issues.alert_state = 'Restored'` 和 `service_states.health_state = 'Normal'`
+   - **如果有新告警**:
+     - 验证观察窗口被取消
+     - 验证重新进入治愈流程
+     - 验证状态未更新为 `Restored`/`Normal`
+
+### P1/P2 告警验证(下钻分析流程)
+
+10) 触发 P1 或 P2 级别 Webhook
+11) 验证直接进入下钻分析流程,跳过故障治愈步骤
+12) 验证 AI 分析评论生成和状态更新
+
+### 最终验证
+
+13) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新
+14) 验证不同告警等级的处理路径正确性
 
 ——
 
-## 9. 后续计划
+## 11. 注意事项
+
+1. **service_states 表逻辑**: 当前版本中,`service_states` 表的更新逻辑暂时不实现,但保留了扩展空间
+2. **Mock 模式**: 当前回滚操作为 Mock 模式,实际部署时需要接入真实的部署系统 API
+3. **错误处理**: 治愈操作失败时会记录日志并继续进入下钻分析流程
+4. **幂等性**: 同一告警的重复处理应该具备幂等性
+
+## 12. 
后续计划 + +### 短期计划 +- 实现 `heal_actions` 表的完整 CRUD 操作 +- 完善故障域识别逻辑,支持更多故障类型 - 接入真实部署系统回滚接口与鉴权 +- 实现治愈方案的动态配置和管理界面 + +### 中期计划 + +- 扩展治愈操作类型:服务重启、扩容、流量切换等 +- 增加治愈方案的执行结果反馈和效果评估 - 将进程内 channel 平滑切换为 MQ(Kafka/NATS) -- 完善指标与可观测:事件消费速率、成功率、时延分位、回滚结果等 -- 增加补偿任务:对“回滚成功但缓存/DB 未一致”的场景进行对账修复 +- 完善指标与可观测:事件消费速率、成功率、时延分位、治愈结果等 + +### 长期计划 + +- 基于历史数据训练 AI 模型,自动推荐最优治愈方案 +- 增加补偿任务:对"治愈成功但缓存/DB 未一致"的场景进行对账修复 +- 实现治愈方案的 A/B 测试和效果对比 +- 构建完整的故障自愈知识库和最佳实践库 diff --git a/internal/alerting/service/remediation/consumer.go b/internal/alerting/service/remediation/consumer.go index 07cae6b..cead8d1 100644 --- a/internal/alerting/service/remediation/consumer.go +++ b/internal/alerting/service/remediation/consumer.go @@ -3,7 +3,6 @@ package remediation import ( "context" "fmt" - "os" "strconv" "time" @@ -17,46 +16,189 @@ type Consumer struct { DB *adb.Database Redis *redis.Client + // Heal action service for P0 alerts + healService HealActionService + + // Observation window manager + obsManager ObservationWindowManager + // sleepFn allows overriding for tests sleepFn func(time.Duration) } func NewConsumer(db *adb.Database, rdb *redis.Client) *Consumer { - return &Consumer{DB: db, Redis: rdb, sleepFn: time.Sleep} + healDAO := NewPgHealActionDAO(db) + healService := NewHealActionService(healDAO) + obsManager := NewRedisObservationWindowManager(rdb) + return &Consumer{ + DB: db, + Redis: rdb, + healService: healService, + obsManager: obsManager, + sleepFn: time.Sleep, + } } -// Start consumes alert messages and performs a mocked rollback then marks restored. +// Start consumes alert messages and processes them based on alert level func (c *Consumer) Start(ctx context.Context, ch <-chan healthcheck.AlertMessage) { if ch == nil { log.Warn().Msg("remediation consumer started without channel; no-op") return } - sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + for { select { case <-ctx.Done(): return case m := <-ch: - // 1) Mock rollback: optional URL composition (unused) - _ = fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(&m)) - // 2) Sleep to simulate rollback time - if c.sleepFn != nil { - c.sleepFn(sleepDur) - } - // 3) On success: add AI analysis comment, update DB and cache - if err := c.addAIAnalysisComment(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - if err := c.markRestoredInDB(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") - } - if err := c.markRestoredInCache(ctx, &m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") + // 首先检查是否有观察窗口需要处理 + c.handleObservationWindow(ctx, &m) + + switch m.Level { + case "P0": + // P0 告警:故障治愈流程 + c.handleP0Alert(ctx, &m) + case "P1", "P2": + // P1/P2 告警:下钻分析流程 + c.handleP1P2Alert(ctx, &m) + default: + log.Warn().Str("level", m.Level).Str("issue", m.ID).Msg("unknown alert level, skipping") } } } } +// handleObservationWindow handles observation window logic for incoming alerts +func (c *Consumer) handleObservationWindow(ctx context.Context, m *healthcheck.AlertMessage) { + if m.Service == "" { + return // No service information, skip observation window check + } + + // 检查是否有该服务的观察窗口 + window, err := c.obsManager.CheckObservation(ctx, m.Service, m.Version) + if err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to check observation window") + return + } + + if window == nil { + return // No active observation window + } + + // 
如果在观察窗口期间出现新的告警,取消观察窗口 + log.Warn(). + Str("service", m.Service). + Str("version", m.Version). + Str("alert_id", m.ID). + Str("observation_alert_id", window.AlertID). + Msg("new alert detected during observation window, cancelling observation") + + if err := c.obsManager.CancelObservation(ctx, m.Service, m.Version); err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to cancel observation window") + } +} + +// handleP0Alert handles P0 alerts with fault healing process +func (c *Consumer) handleP0Alert(ctx context.Context, m *healthcheck.AlertMessage) { + log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P0 alert with fault healing") + + // 1) 确认故障域 + faultDomain := c.healService.IdentifyFaultDomain(m.Labels) + log.Info().Str("issue", m.ID).Str("fault_domain", string(faultDomain)).Msg("identified fault domain") + + // 2) 查询治愈方案 + healAction, err := c.healService.GetHealAction(ctx, faultDomain) + if err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("failed to get heal action") + // 如果无法获取治愈方案,直接进入下钻分析 + c.handleDrillDownAnalysis(ctx, m) + return + } + + // 3) 执行治愈操作 + result, err := c.healService.ExecuteHealAction(ctx, healAction, m.ID, m.Labels) + if err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("failed to execute heal action") + c.handleDrillDownAnalysis(ctx, m) + return + } + + if !result.Success { + log.Warn().Str("issue", m.ID).Str("message", result.Message).Msg("heal action failed") + // 治愈失败,仍然进入下钻分析 + c.handleDrillDownAnalysis(ctx, m) + return + } + + log.Info().Str("issue", m.ID).Str("message", result.Message).Msg("heal action completed successfully") + + // 4) 治愈成功后启动观察窗口 + if m.Service != "" { + obsDuration := GetObservationDuration() + if err := c.obsManager.StartObservation(ctx, m.Service, m.Version, m.ID, obsDuration); err != nil { + log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to start observation window") + } else { + log.Info(). + Str("service", m.Service). + Str("version", m.Version). + Str("alert_id", m.ID). + Dur("duration", obsDuration). 
+ Msg("started observation window after successful healing") + } + } + + // 5) 治愈成功后进入下钻分析(但不立即更新状态) + c.handleDrillDownAnalysisWithObservation(ctx, m) +} + +// handleP1P2Alert handles P1/P2 alerts with drill-down analysis +func (c *Consumer) handleP1P2Alert(ctx context.Context, m *healthcheck.AlertMessage) { + log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P1/P2 alert with drill-down analysis") + + // 直接进入下钻分析流程 + c.handleDrillDownAnalysis(ctx, m) +} + +// handleDrillDownAnalysis performs drill-down analysis and marks alert as restored +func (c *Consumer) handleDrillDownAnalysis(ctx context.Context, m *healthcheck.AlertMessage) { + // 1) 执行 AI 分析 + if err := c.addAIAnalysisComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + + // 2) 更新告警状态为恢复 + if err := c.markRestoredInDB(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") + } + + // 3) 更新缓存状态 + if err := c.markRestoredInCache(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") + } +} + +// handleDrillDownAnalysisWithObservation performs drill-down analysis but delays status update for observation +func (c *Consumer) handleDrillDownAnalysisWithObservation(ctx context.Context, m *healthcheck.AlertMessage) { + // 1) 执行 AI 分析 + if err := c.addAIAnalysisComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + + // 2) 暂时不更新告警状态,等待观察窗口完成 + // 只记录治愈操作完成的评论 + if err := c.addHealingCompletedComment(ctx, m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addHealingCompletedComment failed") + } + + log.Info(). + Str("issue", m.ID). + Str("service", m.Service). + Str("version", m.Version). 
+ Msg("healing completed, waiting for observation window to complete before updating status") +} + +// deriveDeployID derives deployment ID from alert message +// TODO: Use this function when implementing real rollback API calls func deriveDeployID(m *healthcheck.AlertMessage) string { if m == nil { return "" @@ -91,15 +233,38 @@ func (c *Consumer) addAIAnalysisComment(ctx context.Context, m *healthcheck.Aler return err } +func (c *Consumer) addHealingCompletedComment(ctx context.Context, m *healthcheck.AlertMessage) error { + if c.DB == nil || m == nil { + return nil + } + const existsQ = `SELECT 1 FROM alert_issue_comments WHERE issue_id=$1 AND content=$2 LIMIT 1` + const insertQ = `INSERT INTO alert_issue_comments (issue_id, create_at, content) VALUES ($1, NOW(), $2)` + content := "## 治愈操作完成\n" + + "**操作状态**:治愈操作已成功执行\n" + + "**观察窗口**:正在等待观察窗口完成(30分钟)\n" + + "**下一步**:如果观察窗口内无新告警,将自动更新服务状态为正常" + if rows, err := c.DB.QueryContext(ctx, existsQ, m.ID, content); err == nil { + defer rows.Close() + if rows.Next() { + return nil + } + } + _, err := c.DB.ExecContext(ctx, insertQ, m.ID, content) + return err +} + func (c *Consumer) markRestoredInDB(ctx context.Context, m *healthcheck.AlertMessage) error { if c.DB == nil || m == nil { return nil } - // alert_issues + + // 更新 alert_issues 状态 if _, err := c.DB.ExecContext(ctx, `UPDATE alert_issues SET alert_state = 'Restored' , state = 'Closed' WHERE id = $1`, m.ID); err != nil { return err } - // service_states (upsert) + + // 同时更新 service_states.health_state 为 Normal + // 注意:每次修改 service_states 为 Normal 时都需要修改 alert_issues.alert_state 为 Restored if m.Service != "" { const upsert = ` INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) @@ -112,6 +277,7 @@ SET health_state = 'Normal', return err } } + return nil } @@ -146,7 +312,7 @@ return 1 `) _, _ = script.Run(ctx, c.Redis, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing", "alert:index:alert_state:Restored", "alert:index:open", "alert:index:closed"}, "Restored", m.ID, "Closed").Result() - // 2) service_state:{service}:{version} → health_state=Normal; resolved_at=now; add to Normal index + // 更新 service_state 缓存 if m.Service != "" { svcKey := "service_state:" + m.Service + ":" + m.Version now := time.Now().UTC().Format(time.RFC3339Nano) @@ -165,6 +331,54 @@ return 1 return nil } +// CompleteObservationAndUpdateStatus completes observation window and updates service status +func (c *Consumer) CompleteObservationAndUpdateStatus(ctx context.Context, service, version string) error { + if service == "" { + return fmt.Errorf("service name is required") + } + + // 完成观察窗口 + if err := c.obsManager.CompleteObservation(ctx, service, version); err != nil { + return fmt.Errorf("failed to complete observation window: %w", err) + } + + // 更新服务状态为正常 + const upsert = ` +INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) +VALUES ($1, $2, NULL, NOW(), 'Normal', ARRAY[]::text[]) +ON CONFLICT (service, version) DO UPDATE +SET health_state = 'Normal', + resolved_at = NOW(); +` + if _, err := c.DB.ExecContext(ctx, upsert, service, version); err != nil { + return fmt.Errorf("failed to update service state: %w", err) + } + + // 更新缓存 + if c.Redis != nil { + svcKey := "service_state:" + service + ":" + version + now := time.Now().UTC().Format(time.RFC3339Nano) + svcScript := redis.NewScript(` +local v = redis.call('GET', KEYS[1]) +if not v then v = '{}' end +local obj = 
cjson.decode(v)
+obj.health_state = ARGV[1]
+obj.resolved_at = ARGV[2]
+redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL')
+if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end
+return 1
+`)
+		_, _ = svcScript.Run(ctx, c.Redis, []string{svcKey, "service_state:index:health:Normal"}, "Normal", now).Result()
+	}
+
+	// 注意:本补丁未包含在观察窗口到期后调用本方法的调度逻辑,需由外部定时任务触发
+	log.Info().
+		Str("service", service).
+		Str("version", version).
+		Msg("observation window completed successfully, service status updated to Normal")
+
+	return nil
+}
+
 func parseDuration(s string, d time.Duration) time.Duration {
 	if s == "" {
 		return d
diff --git a/internal/alerting/service/remediation/heal_action_dao.go b/internal/alerting/service/remediation/heal_action_dao.go
new file mode 100644
index 0000000..af71b5e
--- /dev/null
+++ b/internal/alerting/service/remediation/heal_action_dao.go
@@ -0,0 +1,145 @@
+package remediation
+
+import (
+	"context"
+	"database/sql"
+	"encoding/json"
+	"fmt"
+
+	adb "github.com/qiniu/zeroops/internal/alerting/database"
+)
+
+// PgHealActionDAO implements HealActionDAO using PostgreSQL
+type PgHealActionDAO struct {
+	DB *adb.Database
+}
+
+// NewPgHealActionDAO creates a new PostgreSQL heal action DAO
+func NewPgHealActionDAO(db *adb.Database) *PgHealActionDAO {
+	return &PgHealActionDAO{DB: db}
+}
+
+// GetByType retrieves a heal action by fault domain type
+func (d *PgHealActionDAO) GetByType(ctx context.Context, faultType string) (*HealAction, error) {
+	// desc 为 PostgreSQL 保留字,作为列名必须加双引号
+	const q = `SELECT id, "desc", type, rules FROM heal_actions WHERE type = $1 LIMIT 1`
+
+	row := d.DB.QueryRowContext(ctx, q, faultType)
+	var action HealAction
+	var rulesJSON string
+
+	err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return nil, fmt.Errorf("no heal action found for type: %s", faultType)
+		}
+		return nil, fmt.Errorf("failed to get heal action by type: %w", err)
+	}
+
+	action.Rules = json.RawMessage(rulesJSON)
+	return &action, nil
+}
+
+// GetByID retrieves a heal action by ID
+func (d *PgHealActionDAO) GetByID(ctx context.Context, id string) (*HealAction, error) {
+	const q = `SELECT id, "desc", type, rules FROM heal_actions WHERE id = $1`
+
+	row := d.DB.QueryRowContext(ctx, q, id)
+	var action HealAction
+	var rulesJSON string
+
+	err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+	if err != nil {
+		if err == sql.ErrNoRows {
+			return nil, fmt.Errorf("no heal action found with id: %s", id)
+		}
+		return nil, fmt.Errorf("failed to get heal action by id: %w", err)
+	}
+
+	action.Rules = json.RawMessage(rulesJSON)
+	return &action, nil
+}
+
+// Create creates a new heal action
+func (d *PgHealActionDAO) Create(ctx context.Context, action *HealAction) error {
+	const q = `INSERT INTO heal_actions (id, "desc", type, rules) VALUES ($1, $2, $3, $4)`
+
+	_, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules))
+	if err != nil {
+		return fmt.Errorf("failed to create heal action: %w", err)
+	}
+
+	return nil
+}
+
+// Update updates an existing heal action
+func (d *PgHealActionDAO) Update(ctx context.Context, action *HealAction) error {
+	const q = `UPDATE heal_actions SET "desc" = $2, type = $3, rules = $4 WHERE id = $1`
+
+	result, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules))
+	if err != nil {
+		return fmt.Errorf("failed to update heal action: %w", err)
+	}
+
+	rowsAffected, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("failed to get rows affected: %w", err)
+	}
+
+	if rowsAffected == 0 {
+		return fmt.Errorf("no heal action found with id: %s", action.ID)
+	}
+
+	return nil
+}
+
+// Delete deletes a heal action by ID
+func (d *PgHealActionDAO) Delete(ctx context.Context, id string) error {
+	const q = `DELETE FROM heal_actions WHERE id = $1`
+
+	result, err := d.DB.ExecContext(ctx, q, id)
+	if err != nil {
+		return fmt.Errorf("failed to delete heal action: %w", err)
+	}
+
+	rowsAffected, err := result.RowsAffected()
+	if err != nil {
+		return fmt.Errorf("failed to get rows affected: %w", err)
+	}
+
+	if rowsAffected == 0 {
+		return fmt.Errorf("no heal action found with id: %s", id)
+	}
+
+	return nil
+}
+
+// List retrieves all heal actions
+func (d *PgHealActionDAO) List(ctx context.Context) ([]*HealAction, error) {
+	const q = `SELECT id, "desc", type, rules FROM heal_actions ORDER BY type, id`
+
+	rows, err := d.DB.QueryContext(ctx, q)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list heal actions: %w", err)
+	}
+	defer rows.Close()
+
+	var actions []*HealAction
+	for rows.Next() {
+		var action HealAction
+		var rulesJSON string
+
+		err := rows.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON)
+		if err != nil {
+			return nil, fmt.Errorf("failed to scan heal action: %w", err)
+		}
+
+		action.Rules = json.RawMessage(rulesJSON)
+		actions = append(actions, &action)
+	}
+
+	if err = rows.Err(); err != nil {
+		return nil, fmt.Errorf("error iterating heal actions: %w", err)
+	}
+
+	return actions, nil
+}
diff --git a/internal/alerting/service/remediation/heal_action_service.go b/internal/alerting/service/remediation/heal_action_service.go
new file mode 100644
index 0000000..aa9a8f0
--- /dev/null
+++ b/internal/alerting/service/remediation/heal_action_service.go
@@ -0,0 +1,172 @@
+package remediation
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"os"
+	"time"
+
+	"github.com/rs/zerolog/log"
+)
+
+// HealActionServiceImpl implements HealActionService
+type HealActionServiceImpl struct {
+	dao HealActionDAO
+}
+
+// NewHealActionService creates a new heal action service
+func NewHealActionService(dao HealActionDAO) *HealActionServiceImpl {
+	return &HealActionServiceImpl{dao: dao}
+}
+
+// IdentifyFaultDomain identifies the fault domain from alert labels
+func (s *HealActionServiceImpl) IdentifyFaultDomain(labels map[string]string) FaultDomain {
+	service := labels["service_name"]
+	version := labels["version"]
+
+	if service != "" && version != "" {
+		return FaultDomainServiceVersion
+	}
+
+	// TODO: 可根据更多条件扩展其他故障域
+	// - 整体问题:检查是否有全局性指标异常
+	// - 单机房问题:检查是否有机房相关标签
+	// - 网络问题:检查是否有网络相关标签
+	return FaultDomainUnknown
+}
+
+// GetHealAction retrieves the appropriate heal action for a fault domain
+func (s *HealActionServiceImpl) GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) {
+	if faultDomain == FaultDomainUnknown {
+		return nil, fmt.Errorf("unknown fault domain, cannot determine heal action")
+	}
+
+	action, err := s.dao.GetByType(ctx, string(faultDomain))
+	if err != nil {
+		return nil, fmt.Errorf("failed to get heal action for domain %s: %w", faultDomain, err)
+	}
+
+	return action, nil
+}
+
+// ExecuteHealAction executes the heal action based on the rules
+func (s *HealActionServiceImpl) ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) {
+	if action == nil {
+		return &HealActionResult{
+			Success: false,
+			Error:   "no heal action provided",
+		}, nil
+	}
+
+	// Parse the rules
+	var rules HealActionRules
+	if err := 
json.Unmarshal(action.Rules, &rules); err != nil { + return &HealActionResult{ + Success: false, + Error: fmt.Sprintf("failed to parse heal action rules: %v", err), + }, nil + } + + // Execute based on action type + switch rules.Action { + case "rollback": + return s.executeRollback(ctx, rules, alertID, labels) + case "alert": + return s.executeAlert(rules, alertID, labels) + default: + return &HealActionResult{ + Success: false, + Error: fmt.Sprintf("unsupported action type: %s", rules.Action), + }, nil + } +} + +// executeRollback executes a rollback operation +func (s *HealActionServiceImpl) executeRollback(ctx context.Context, rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { + _ = ctx // TODO: Use context for HTTP timeout when calling real rollback API + // Check deployment status if specified + if rules.DeploymentStatus != "" { + // TODO: 实际实现中应该查询部署系统获取真实的部署状态 + // 这里暂时模拟检查 + deployStatus := s.getDeploymentStatus(labels) + if deployStatus != rules.DeploymentStatus { + return &HealActionResult{ + Success: false, + Message: fmt.Sprintf("deployment status mismatch: expected %s, got %s", rules.DeploymentStatus, deployStatus), + }, nil + } + } + + // Mock rollback execution + sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + log.Info(). + Str("alert_id", alertID). + Str("target", rules.Target). + Dur("sleep_duration", sleepDur). + Msg("executing mock rollback") + + // Simulate rollback time + time.Sleep(sleepDur) + + // TODO: 实际实现中应该调用真实的回滚接口 + // url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(labels)) + // 发起 HTTP POST 请求到回滚接口 + + return &HealActionResult{ + Success: true, + Message: fmt.Sprintf("rollback completed successfully, target: %s", rules.Target), + }, nil +} + +// executeAlert executes an alert-only action (no automatic healing) +func (s *HealActionServiceImpl) executeAlert(rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { + _ = labels // TODO: Use labels for context-specific alert messages + log.Warn(). + Str("alert_id", alertID). + Str("message", rules.Message). + Msg("heal action requires manual intervention") + + return &HealActionResult{ + Success: false, + Message: rules.Message, + }, nil +} + +// getDeploymentStatus gets the deployment status for the given labels +// TODO: 实际实现中应该查询部署系统获取真实的部署状态 +func (s *HealActionServiceImpl) getDeploymentStatus(labels map[string]string) string { + // 这里暂时返回模拟状态 + // 实际实现中应该: + // 1. 从 labels 中提取 service 和 version + // 2. 查询部署系统 API 获取当前部署状态 + // 3. 
返回 "deploying" 或 "deployed" + + service := labels["service_name"] + version := labels["version"] + + if service == "" || version == "" { + return "unknown" + } + + // 模拟逻辑:如果版本号包含 "dev" 或 "test",认为是发布中,待确认修改为实际的部署状态区分方式 + if version == "dev" || version == "test" { + return "deploying" + } + + return "deployed" +} + +// deriveDeployIDFromLabels derives deployment ID from labels +// TODO: Use this function when implementing real rollback API calls +func deriveDeployIDFromLabels(labels map[string]string) string { + if v := labels["deploy_id"]; v != "" { + return v + } + service := labels["service_name"] + version := labels["version"] + if service != "" && version != "" { + return fmt.Sprintf("%s:%s", service, version) + } + return "" +} diff --git a/internal/alerting/service/remediation/heal_action_service_test.go b/internal/alerting/service/remediation/heal_action_service_test.go new file mode 100644 index 0000000..eb45209 --- /dev/null +++ b/internal/alerting/service/remediation/heal_action_service_test.go @@ -0,0 +1,178 @@ +package remediation + +import ( + "context" + "encoding/json" + "testing" +) + +func TestHealActionServiceImpl_IdentifyFaultDomain(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + labels map[string]string + expected FaultDomain + }{ + { + name: "service_version_issue", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expected: FaultDomainServiceVersion, + }, + { + name: "missing_service_name", + labels: map[string]string{ + "version": "v1.0.0", + }, + expected: FaultDomainUnknown, + }, + { + name: "missing_version", + labels: map[string]string{ + "service_name": "test-service", + }, + expected: FaultDomainUnknown, + }, + { + name: "empty_labels", + labels: map[string]string{}, + expected: FaultDomainUnknown, + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := service.IdentifyFaultDomain(tt.labels) + if result != tt.expected { + t.Errorf("IdentifyFaultDomain() = %v, want %v", result, tt.expected) + } + }) + } +} + +func TestHealActionServiceImpl_ExecuteHealAction(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + action *HealAction + alertID string + labels map[string]string + expectError bool + }{ + { + name: "rollback_action", + action: &HealAction{ + ID: "test-rollback", + Desc: "Test rollback action", + Type: "service_version_issue", + Rules: json.RawMessage(`{ + "deployment_status": "deploying", + "action": "rollback", + "target": "previous_version" + }`), + }, + alertID: "test-alert-1", + labels: map[string]string{ + "service_name": "test-service", + "version": "dev", + }, + expectError: false, + }, + { + name: "alert_action", + action: &HealAction{ + ID: "test-alert", + Desc: "Test alert action", + Type: "service_version_issue", + Rules: json.RawMessage(`{ + "action": "alert", + "message": "Version already deployed, manual intervention required" + }`), + }, + alertID: "test-alert-2", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expectError: false, + }, + { + name: "nil_action", + action: nil, + alertID: "test-alert-3", + labels: map[string]string{}, + expectError: false, // Should not error, but return failure result + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result, err := service.ExecuteHealAction(context.Background(), tt.action, tt.alertID, tt.labels) + + if tt.expectError && err == nil { + 
t.Errorf("ExecuteHealAction() expected error but got none") + } + if !tt.expectError && err != nil { + t.Errorf("ExecuteHealAction() unexpected error: %v", err) + } + + if result == nil { + t.Errorf("ExecuteHealAction() returned nil result") + } + }) + } +} + +func TestHealActionServiceImpl_getDeploymentStatus(t *testing.T) { + service := &HealActionServiceImpl{} + + tests := []struct { + name string + labels map[string]string + expected string + }{ + { + name: "deploying_version", + labels: map[string]string{ + "service_name": "test-service", + "version": "dev", + }, + expected: "deploying", + }, + { + name: "deployed_version", + labels: map[string]string{ + "service_name": "test-service", + "version": "v1.0.0", + }, + expected: "deployed", + }, + { + name: "missing_service_name", + labels: map[string]string{ + "version": "v1.0.0", + }, + expected: "unknown", + }, + { + name: "missing_version", + labels: map[string]string{ + "service_name": "test-service", + }, + expected: "unknown", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + result := service.getDeploymentStatus(tt.labels) + if result != tt.expected { + t.Errorf("getDeploymentStatus() = %v, want %v", result, tt.expected) + } + }) + } +} diff --git a/internal/alerting/service/remediation/init_heal_actions.sql b/internal/alerting/service/remediation/init_heal_actions.sql new file mode 100644 index 0000000..b11fcee --- /dev/null +++ b/internal/alerting/service/remediation/init_heal_actions.sql @@ -0,0 +1,38 @@ +-- 创建 heal_actions 表 +CREATE TABLE IF NOT EXISTS heal_actions ( + id VARCHAR(255) PRIMARY KEY, + desc TEXT NOT NULL, + type VARCHAR(255) NOT NULL, + rules JSONB NOT NULL +); + +-- 创建索引 +CREATE INDEX IF NOT EXISTS idx_heal_actions_type ON heal_actions(type); + +-- 插入示例数据 +INSERT INTO heal_actions (id, desc, type, rules) VALUES +( + 'service_version_rollback_deploying', + '服务版本回滚方案(发布中版本)', + 'service_version_issue', + '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}' +), +( + 'service_version_alert_deployed', + '服务版本告警方案(已完成发布版本)', + 'service_version_issue', + '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚,需要人工介入处理"}' +), +( + 'service_version_rollback_default', + '服务版本回滚方案(默认)', + 'service_version_issue', + '{"action": "rollback", "target": "previous_version"}' +) +ON CONFLICT (id) DO UPDATE SET + desc = EXCLUDED.desc, + type = EXCLUDED.type, + rules = EXCLUDED.rules; + +-- 查询验证 +SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id; diff --git a/internal/alerting/service/remediation/observation_window.go b/internal/alerting/service/remediation/observation_window.go new file mode 100644 index 0000000..c07e146 --- /dev/null +++ b/internal/alerting/service/remediation/observation_window.go @@ -0,0 +1,169 @@ +package remediation + +import ( + "context" + "encoding/json" + "fmt" + "time" + + "github.com/redis/go-redis/v9" + "github.com/rs/zerolog/log" +) + +// RedisObservationWindowManager implements ObservationWindowManager using Redis +type RedisObservationWindowManager struct { + redis *redis.Client +} + +// NewRedisObservationWindowManager creates a new Redis-based observation window manager +func NewRedisObservationWindowManager(redis *redis.Client) *RedisObservationWindowManager { + return &RedisObservationWindowManager{redis: redis} +} + +// StartObservation starts an observation window for a service +func (m *RedisObservationWindowManager) StartObservation(ctx context.Context, service, version, alertID string, 
duration time.Duration) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + now := time.Now() + window := &ObservationWindow{ + Duration: duration, + Service: service, + Version: version, + AlertID: alertID, + StartTime: now, + EndTime: now.Add(duration), + IsActive: true, + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + data, err := json.Marshal(window) + if err != nil { + return fmt.Errorf("failed to marshal observation window: %w", err) + } + + // Store with TTL equal to observation duration + buffer + ttl := duration + 5*time.Minute + err = m.redis.Set(ctx, key, data, ttl).Err() + if err != nil { + return fmt.Errorf("failed to store observation window: %w", err) + } + + log.Info(). + Str("service", service). + Str("version", version). + Str("alert_id", alertID). + Dur("duration", duration). + Time("end_time", window.EndTime). + Msg("started observation window") + + return nil +} + +// CheckObservation checks if there's an active observation window for a service +func (m *RedisObservationWindowManager) CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) { + if m.redis == nil { + return nil, fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + data, err := m.redis.Get(ctx, key).Result() + if err != nil { + if err == redis.Nil { + return nil, nil // No active observation window + } + return nil, fmt.Errorf("failed to get observation window: %w", err) + } + + var window ObservationWindow + err = json.Unmarshal([]byte(data), &window) + if err != nil { + return nil, fmt.Errorf("failed to unmarshal observation window: %w", err) + } + + // Check if observation window has expired + if time.Now().After(window.EndTime) { + // Clean up expired window + m.redis.Del(ctx, key) + return nil, nil + } + + return &window, nil +} + +// CompleteObservation completes an observation window and marks it as successful +func (m *RedisObservationWindowManager) CompleteObservation(ctx context.Context, service, version string) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + + // Get the current window + window, err := m.CheckObservation(ctx, service, version) + if err != nil { + return fmt.Errorf("failed to check observation window: %w", err) + } + + if window == nil { + return fmt.Errorf("no active observation window found for service %s version %s", service, version) + } + + // Mark as completed and remove from Redis + window.IsActive = false + err = m.redis.Del(ctx, key).Err() + if err != nil { + return fmt.Errorf("failed to remove observation window: %w", err) + } + + log.Info(). + Str("service", service). + Str("version", version). + Str("alert_id", window.AlertID). + Dur("duration", window.Duration). 
+ Msg("completed observation window successfully") + + return nil +} + +// CancelObservation cancels an observation window due to new alerts +func (m *RedisObservationWindowManager) CancelObservation(ctx context.Context, service, version string) error { + if m.redis == nil { + return fmt.Errorf("redis client is nil") + } + + key := fmt.Sprintf("observation:%s:%s", service, version) + + // Get the current window for logging + window, err := m.CheckObservation(ctx, service, version) + if err != nil { + return fmt.Errorf("failed to check observation window: %w", err) + } + + if window == nil { + return nil // No active window to cancel + } + + // Remove the observation window + err = m.redis.Del(ctx, key).Err() + if err != nil { + return fmt.Errorf("failed to cancel observation window: %w", err) + } + + log.Warn(). + Str("service", service). + Str("version", version). + Str("alert_id", window.AlertID). + Msg("cancelled observation window due to new alerts") + + return nil +} + +// GetObservationDuration returns the configured observation duration +// TODO: 后续可以从配置或数据库中动态获取观察时间 +func GetObservationDuration() time.Duration { + // 暂时使用固定的30分钟观察窗口 + // 后续可以扩展为从环境变量或配置文件中读取 + return 30 * time.Minute +} diff --git a/internal/alerting/service/remediation/observation_window_test.go b/internal/alerting/service/remediation/observation_window_test.go new file mode 100644 index 0000000..8a2ec35 --- /dev/null +++ b/internal/alerting/service/remediation/observation_window_test.go @@ -0,0 +1,100 @@ +package remediation + +import ( + "context" + "testing" + "time" + + "github.com/redis/go-redis/v9" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestRedisObservationWindowManager(t *testing.T) { + // 使用内存 Redis 客户端进行测试 + rdb := redis.NewClient(&redis.Options{ + Addr: "localhost:6379", // 需要 Redis 实例 + }) + defer rdb.Close() + + // 检查 Redis 连接 + ctx := context.Background() + if err := rdb.Ping(ctx).Err(); err != nil { + t.Skip("Redis not available, skipping test") + } + + manager := NewRedisObservationWindowManager(rdb) + + t.Run("StartObservation", func(t *testing.T) { + service := "test-service" + version := "v1.0.0" + alertID := "test-alert-1" + duration := 5 * time.Minute + + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 验证观察窗口已创建 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + require.NotNil(t, window) + assert.Equal(t, service, window.Service) + assert.Equal(t, version, window.Version) + assert.Equal(t, alertID, window.AlertID) + assert.True(t, window.IsActive) + }) + + t.Run("CheckObservation_NotFound", func(t *testing.T) { + service := "non-existent-service" + version := "v1.0.0" + + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) + + t.Run("CompleteObservation", func(t *testing.T) { + service := "test-service-2" + version := "v1.0.0" + alertID := "test-alert-2" + duration := 5 * time.Minute + + // 先创建观察窗口 + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 完成观察窗口 + err = manager.CompleteObservation(ctx, service, version) + require.NoError(t, err) + + // 验证观察窗口已被移除 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) + + t.Run("CancelObservation", func(t *testing.T) { + service := "test-service-3" + version := "v1.0.0" + alertID := "test-alert-3" + duration := 5 * 
time.Minute + + // 先创建观察窗口 + err := manager.StartObservation(ctx, service, version, alertID, duration) + require.NoError(t, err) + + // 取消观察窗口 + err = manager.CancelObservation(ctx, service, version) + require.NoError(t, err) + + // 验证观察窗口已被移除 + window, err := manager.CheckObservation(ctx, service, version) + require.NoError(t, err) + assert.Nil(t, window) + }) +} + +func TestGetObservationDuration(t *testing.T) { + duration := GetObservationDuration() + assert.Equal(t, 30*time.Minute, duration) +} diff --git a/internal/alerting/service/remediation/types.go b/internal/alerting/service/remediation/types.go new file mode 100644 index 0000000..c1c5f02 --- /dev/null +++ b/internal/alerting/service/remediation/types.go @@ -0,0 +1,74 @@ +package remediation + +import ( + "context" + "encoding/json" + "time" +) + +// HealAction represents a healing action configuration +type HealAction struct { + ID string `json:"id"` + Desc string `json:"desc"` + Type string `json:"type"` + Rules json.RawMessage `json:"rules"` +} + +// HealActionRules represents the rules for a heal action +type HealActionRules struct { + DeploymentStatus string `json:"deployment_status,omitempty"` + Action string `json:"action"` + Target string `json:"target,omitempty"` + Message string `json:"message,omitempty"` +} + +// FaultDomain represents the identified fault domain +type FaultDomain string + +const ( + FaultDomainServiceVersion FaultDomain = "service_version_issue" + FaultDomainUnknown FaultDomain = "unknown" +) + +// HealActionResult represents the result of executing a heal action +type HealActionResult struct { + Success bool `json:"success"` + Message string `json:"message,omitempty"` + Error string `json:"error,omitempty"` +} + +// ObservationWindow represents the observation period after healing +type ObservationWindow struct { + Duration time.Duration `json:"duration"` + Service string `json:"service"` + Version string `json:"version"` + AlertID string `json:"alert_id"` + StartTime time.Time `json:"start_time"` + EndTime time.Time `json:"end_time"` + IsActive bool `json:"is_active"` +} + +// ObservationWindowManager defines the interface for managing observation windows +type ObservationWindowManager interface { + StartObservation(ctx context.Context, service, version, alertID string, duration time.Duration) error + CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) + CompleteObservation(ctx context.Context, service, version string) error + CancelObservation(ctx context.Context, service, version string) error +} + +// HealActionDAO defines the interface for heal action database operations +type HealActionDAO interface { + GetByType(ctx context.Context, faultType string) (*HealAction, error) + GetByID(ctx context.Context, id string) (*HealAction, error) + Create(ctx context.Context, action *HealAction) error + Update(ctx context.Context, action *HealAction) error + Delete(ctx context.Context, id string) error + List(ctx context.Context) ([]*HealAction, error) +} + +// HealActionService defines the interface for heal action business logic +type HealActionService interface { + IdentifyFaultDomain(labels map[string]string) FaultDomain + GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) + ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) +} From b40dbd31ccce39b0e748566f1626752feb54b22f Mon Sep 17 00:00:00 2001 From: acd19ml Date: Mon, 22 Sep 2025 17:55:37 +0800 Subject: 
[PATCH 4/4] Revert "Merge branch 'feature/alert-healing-with-observation-window' into develop"

This reverts commit bc519391b3de8af08641d9bc30f4e69b6d2bbf8d, reversing
changes made to 7ae82d12553ba2c02828e3b44b836cb6f629b701.
---
 docs/alerting/database-design.md              |  47 +--
 go.mod                                        |   3 -
 internal/alerting/database/database.go        |   5 -
 .../alerting/service/remediation/README.md    | 373 +++---
 .../alerting/service/remediation/consumer.go  | 258 ++----
 .../service/remediation/heal_action_dao.go    | 145 -------
 .../remediation/heal_action_service.go        | 172 --------
 .../remediation/heal_action_service_test.go   | 178 ---------
 .../service/remediation/init_heal_actions.sql |  38 --
 .../service/remediation/observation_window.go | 169 --------
 .../remediation/observation_window_test.go    | 100 -----
 .../alerting/service/remediation/types.go     |  74 ----
 12 files changed, 70 insertions(+), 1492 deletions(-)
 delete mode 100644 internal/alerting/service/remediation/heal_action_dao.go
 delete mode 100644 internal/alerting/service/remediation/heal_action_service.go
 delete mode 100644 internal/alerting/service/remediation/heal_action_service_test.go
 delete mode 100644 internal/alerting/service/remediation/init_heal_actions.sql
 delete mode 100644 internal/alerting/service/remediation/observation_window.go
 delete mode 100644 internal/alerting/service/remediation/observation_window_test.go
 delete mode 100644 internal/alerting/service/remediation/types.go

diff --git a/docs/alerting/database-design.md b/docs/alerting/database-design.md
index 0f860df..b119349 100644
--- a/docs/alerting/database-design.md
+++ b/docs/alerting/database-design.md
@@ -2,7 +2,7 @@
 
 ## 概述
 
-本文档为最新数据库设计,总计包含 7 张表:
+本文档为最新数据库设计,总计包含 6 张表:
 
 - alert_issues
 - alert_issue_comments
@@ -10,7 +10,6 @@
 - alert_rules
 - alert_rule_metas
 - service_states
-- heal_actions
 
 ## 数据表设计
 
@@ -112,7 +111,7 @@
 
 ---
 
-### 6) service_states(服务状态表)
+### 7) service_states(服务状态表)
 
 追踪服务在某一版本上的健康状态与处置进度。
 
@@ -128,34 +127,6 @@
 **索引建议:**
 - PRIMARY KEY: `(service, version)`
 
----
-
-### 7) heal_actions(告警治愈解决方案表)
-
-存储不同故障域对应的治愈方案和规则。
-
-| 字段名 | 类型 | 说明 |
-|--------|------|------|
-| id | varchar(255) PK | 治愈方案 ID |
-| desc | text | 简单描述,如 action 是处理什么告警场景的 |
-| type | varchar(255) | 对应的故障域类型 |
-| rules | jsonb | 条件规则:{condition1: action1, condition2: action2} |
-
-**索引建议:**
-- PRIMARY KEY: `id`
-- INDEX: `(type)`
-
-**示例数据:**
-```sql
-INSERT INTO heal_actions (id, "desc", type, rules) VALUES
-('service_version_rollback', '服务版本回滚方案', 'service_version_issue',
- '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}'),
-('service_version_alert', '服务版本告警方案', 'service_version_issue',
- '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚"}');
-```
-
-TODO: health_state映射逻辑
-
 ## 数据关系(ER)
 
 ```mermaid
@@ -204,25 +175,13 @@ erDiagram
     text content
   }
 
-  heal_actions {
-    varchar id PK
-    text desc
-    varchar type
-    jsonb rules
-  }
-
   %% 通过 service 等标签在应用层逻辑关联
   alert_rule_metas ||..|| alert_rules : "by alert_name"
   service_states ||..|| alert_rule_metas : "by service/version labels"
-  heal_actions ||..|| alert_issues : "by fault domain analysis"
 ```
 
 ## 数据流转
 
 1. 以 `alert_rules` 为模版,结合 `alert_rule_metas` 渲染出面向具体服务/版本等的规则(labels 可为空 `{}` 表示全局默认,或包含如 service/version 等标签)。
 2. 指标或规则参数发生调整时,记录到 `alert_meta_change_logs`。
-3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。
-4. 
**告警治愈流程**: - - P0 告警:根据 `alert_issues.labels` 识别故障域,查询 `heal_actions` 获取治愈方案 - - 执行治愈操作(如回滚),成功后更新 `alert_issues` 和 `service_states` 状态 - - P1/P2 告警:直接进入下钻分析,记录分析结果到 `alert_issue_comments` \ No newline at end of file +3. 规则触发创建 `alert_issues`;处理过程中的动作写入 `alert_issue_comments`。 \ No newline at end of file diff --git a/go.mod b/go.mod index 8cf046f..6094f9c 100644 --- a/go.mod +++ b/go.mod @@ -9,7 +9,6 @@ require ( github.com/lib/pq v1.10.9 github.com/redis/go-redis/v9 v9.5.1 github.com/rs/zerolog v1.34.0 - github.com/stretchr/testify v1.11.1 ) require ( @@ -17,7 +16,6 @@ require ( github.com/bytedance/sonic/loader v0.2.4 // indirect github.com/cespare/xxhash/v2 v2.2.0 // indirect github.com/cloudwego/base64x v0.1.5 // indirect - github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect github.com/dgryski/go-rendezvous v0.0.0-20200823014737-9f7001d12a5f // indirect github.com/gabriel-vasile/mimetype v1.4.9 // indirect github.com/gin-contrib/cors v1.7.6 // indirect @@ -41,7 +39,6 @@ require ( github.com/modern-go/reflect2 v1.0.2 // indirect github.com/natefinch/lumberjack v2.0.0+incompatible // indirect github.com/pelletier/go-toml/v2 v2.2.4 // indirect - github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/twitchyliquid64/golang-asm v0.15.1 // indirect github.com/ugorji/go/codec v1.3.0 // indirect golang.org/x/arch v0.18.0 // indirect diff --git a/internal/alerting/database/database.go b/internal/alerting/database/database.go index 6b2ab9b..e6ee504 100644 --- a/internal/alerting/database/database.go +++ b/internal/alerting/database/database.go @@ -38,8 +38,3 @@ func (d *Database) ExecContext(ctx context.Context, q string, args ...any) (sql. func (d *Database) QueryContext(ctx context.Context, q string, args ...any) (*sql.Rows, error) { return d.db.QueryContext(ctx, q, args...) } - -// QueryRowContext exposes database/sql QueryRowContext for single row SELECT queries. -func (d *Database) QueryRowContext(ctx context.Context, q string, args ...any) *sql.Row { - return d.db.QueryRowContext(ctx, q, args...) -} diff --git a/internal/alerting/service/remediation/README.md b/internal/alerting/service/remediation/README.md index ca41e49..de5ca08 100644 --- a/internal/alerting/service/remediation/README.md +++ b/internal/alerting/service/remediation/README.md @@ -1,28 +1,23 @@ -# remediation — 告警治愈与下钻分析 +# remediation — 通道消费与自动回滚(Mock) -本包实现一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,根据告警等级进行分流处理: -- **P0 告警**:进入"故障治愈"模块,执行自动修复操作 -- **P1/P2 告警**:进入"下钻分析"模块,进行深度分析 +本包规划一个后台处理器:消费 `healthcheck` 投递到进程内 channel 的告警消息,模拟执行“自动回滚”,回滚成功后将相关告警与服务态标记为恢复。 —— ## 1. 
目标 - 订阅 `healthcheck` 的 `AlertMessage`(进程内 channel) -- 根据 `level` 字段进行分流: - - **P0 告警**:故障治愈流程 - 1) 确认故障域(从 labels 分析 service_name + version) - 2) 查询 `heal_actions` 表获取治愈方案 - 3) 执行治愈操作(当前仅支持回滚) - 4) 治愈成功后启动观察窗口(默认30分钟) - 5) 观察窗口内如果出现新告警,取消观察并重新处理 - 6) 观察窗口完成后,更新服务状态为正常 - - **P1/P2 告警**:直接进入下钻分析流程 - 1) 执行 AI 分析 - 2) 更新告警状态为恢复 - 3) 记录分析结果到评论 - -> 说明:本阶段实现故障域识别和治愈方案查询,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 +- 对每条消息: + 1) Mock 调用回滚接口 `POST /v1/deployments/:deployID/rollback` + 2) `sleep 30s` 后返回“回滚成功”的模拟响应 + 3) 若成功,则更新 DB 与缓存: + - `alert_issues.alert_state = 'Restored'` + - `alert_issues.state = 'Closed'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()`(当前时间) + - 同时在 `alert_issue_comments` 中追加一条 AI 分析评论(见下文内容模板) + +> 说明:本阶段仅实现消费与 Mock,真实回滚接口与鉴权可后续接入 `internal/service_manager` 的部署 API。 —— @@ -42,10 +37,8 @@ } ``` -- 故障域识别:从 `Labels` 中提取 `service_name` 和 `version` 信息 - deployID 的来源(用于构造回滚 URL): - - 可从 `Labels["deploy_id"]`(若存在)读取 - - 若为空,可按 `{service}:{version}` 组装一个占位 ID + - Mock 阶段:可从 `Labels["deploy_id"]`(若存在)读取;若为空,可按 `{service}:{version}` 组装一个占位 ID。 —— @@ -69,7 +62,7 @@ REMEDIATION_ROLLBACK_SLEEP=30s —— -## 4. 处理流程(伪代码) +## 4. 流程(伪代码) ```go func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rdb *redis.Client) { @@ -78,256 +71,30 @@ func StartConsumer(ctx context.Context, ch <-chan AlertMessage, db *Database, rd case <-ctx.Done(): return case m := <-ch: - switch m.Level { - case "P0": - // P0 告警:故障治愈流程 - handleP0Alert(ctx, m, db, rdb) - case "P1", "P2": - // P1/P2 告警:下钻分析流程 - handleP1P2Alert(ctx, m, db, rdb) - default: - log.Printf("Unknown alert level: %s", m.Level) + // 1) 组装回滚 URL(Mock) + deployID := m.Labels["deploy_id"] + if deployID == "" { + // 仅 Mock:用 service:version 兜底 + deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) } - } - } -} - -// P0 告警处理:故障治愈流程 -func handleP0Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 确认故障域 - faultDomain := identifyFaultDomain(m.Labels) - - // 2) 查询治愈方案 - healAction, err := queryHealAction(ctx, db, faultDomain) - if err != nil { - log.Printf("Failed to query heal action: %v", err) - // 治愈方案查询失败,直接进入下钻分析 - handleDrillDownAnalysis(ctx, m, db, rdb) - return - } - - // 3) 执行治愈操作 - success := executeHealAction(ctx, healAction, m) - if !success { - log.Printf("Heal action failed for alert %s", m.ID) - // 治愈操作失败,直接进入下钻分析 - handleDrillDownAnalysis(ctx, m, db, rdb) - return - } - - // 4) 治愈成功后启动观察窗口,延迟状态更新 - handleDrillDownAnalysisWithObservation(ctx, m, db, rdb) -} - -// P1/P2 告警处理:下钻分析流程 -func handleP1P2Alert(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - handleDrillDownAnalysis(ctx, m, db, rdb) -} + url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deployID) -// 故障域识别 -func identifyFaultDomain(labels map[string]string) string { - service := labels["service_name"] - version := labels["version"] - - if service != "" && version != "" { - return "service_version_issue" - } - - // 可根据更多条件扩展其他故障域 - return "unknown" -} + // 2) 发起回滚(Mock):sleep 指定时间再判为成功 + sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) + // TODO: 如需真实 HTTP 调用,可在此发起 POST 并根据响应判断 -// 查询治愈方案 -func queryHealAction(ctx context.Context, db *Database, faultDomain string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` - // 实现查询逻辑 - return nil, nil -} - -// 执行治愈操作 -func executeHealAction(ctx context.Context, action *HealAction, m AlertMessage) bool { - // 根据 action.rules 中的条件执行相应操作 
- // 当前仅支持回滚操作 - if action.Rules["action"] == "rollback" { - return executeRollback(ctx, m) - } else if action.Rules["action"] == "alert" { - log.Printf("Alert: %s", action.Rules["message"]) - return false - } - return false -} - -// 执行回滚操作 -func executeRollback(ctx context.Context, m AlertMessage) bool { - deployID := m.Labels["deploy_id"] - if deployID == "" { - deployID = fmt.Sprintf("%s:%s", m.Service, m.Version) + // 3) 成功后,先写入 AI 分析评论,再更新 DB 与缓存状态 + _ = addAIAnalysisComment(ctx, db, m) + _ = markRestoredInDB(ctx, db, m) + _ = markRestoredInCache(ctx, rdb, m) + } } - - // Mock 回滚:sleep 指定时间 - sleep(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - // TODO: 真实 HTTP 调用回滚接口 - - return true -} - -// 下钻分析处理(P1/P2 告警直接使用) -func handleDrillDownAnalysis(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 执行 AI 分析 - _ = addAIAnalysisComment(ctx, db, m) - - // 2) 更新告警状态为恢复 - _ = markRestoredInDB(ctx, db, m) - - // 3) 更新缓存状态 - _ = markRestoredInCache(ctx, rdb, m) -} - -// 下钻分析处理(P0 告警治愈后使用,延迟状态更新) -func handleDrillDownAnalysisWithObservation(ctx context.Context, m AlertMessage, db *Database, rdb *redis.Client) { - // 1) 执行 AI 分析 - _ = addAIAnalysisComment(ctx, db, m) - - // 2) 记录治愈完成评论,但不更新告警状态 - _ = addHealingCompletedComment(ctx, db, m) - - // 3) 启动观察窗口,等待30分钟 - _ = startObservationWindow(ctx, m.Service, m.Version, m.ID, 30*time.Minute) - - // 注意:此时不更新 alert_issues.alert_state 和 service_states.health_state - // 状态更新将在观察窗口完成后进行 -} - -// 观察窗口完成后的处理 -func completeObservationWindow(ctx context.Context, service, version string, db *Database, rdb *redis.Client) { - // 1) 完成观察窗口 - _ = completeObservation(ctx, service, version) - - // 2) 更新 alert_issues.alert_state 为 'Restored' - // 3) 更新 service_states.health_state 为 'Normal' - // 4) 更新相关缓存 - _ = markServiceAsNormal(ctx, service, version, db, rdb) - - log.Printf("Observation window completed for service %s version %s, status updated to Normal", service, version) } ``` —— -## 5. 故障域识别与治愈方案 - -### 故障域类型 - -当前支持的故障域类型: - -1. **service_version_issue**:服务版本问题 - - 识别条件:`labels["service_name"]` 和 `labels["version"]` 都存在 - - 治愈方案: - - 发布中版本:执行回滚操作 - - 已完成发布版本:提示暂不支持自动回滚 - -2. **unknown**:未知故障域 - - 识别条件:无法从标签中识别出已知故障域 - - 处理方式:跳过治愈,直接进入下钻分析 - -### 治愈方案规则 - -`heal_actions.rules` 字段的 JSON 结构: - -```json -{ - "deployment_status": "deploying|deployed", - "action": "rollback|alert", - "target": "previous_version", - "message": "版本已发布,暂不支持自动回滚" -} -``` - -### 治愈操作类型 - -1. **rollback**:执行回滚操作 - - 调用部署系统的回滚接口 - - 回滚到上一个稳定版本 - -2. **alert**:仅告警,不执行自动操作 - - 记录告警信息 - - 需要人工介入处理 - -### 扩展性设计 - -- 故障域类型可扩展:整体问题、单机房问题、网络问题等 -- 治愈方案可扩展:重启服务、扩容、切换流量等 -- 规则条件可扩展:基于更多标签和指标进行判断 - -#### 添加新的故障域类型 - -1. 在 `types.go` 中添加新的 `FaultDomain` 常量 -2. 在 `IdentifyFaultDomain` 方法中添加识别逻辑 -3. 在数据库中配置对应的治愈方案 - -#### 添加新的治愈操作类型 - -1. 在 `HealActionRules` 结构体中添加新字段 -2. 在 `ExecuteHealAction` 方法中添加新的 case 分支 -3. 实现具体的治愈操作逻辑 - -### 观察窗口机制 - -观察窗口是治愈操作完成后的验证期,用于确保治愈操作的有效性: - -1. **启动条件**:P0 告警治愈操作成功完成后自动启动 -2. **持续时间**:默认30分钟,可配置 -3. **监控内容**:观察该服务是否在窗口期内出现新的告警 -4. **处理逻辑**: - - 如果窗口期内出现新告警:取消观察窗口,重新进入治愈流程 - - 如果窗口期内无新告警:完成观察窗口,更新服务状态为正常 -5. **状态更新时机**: - - **治愈操作完成后**:不立即更新状态,只记录治愈完成评论 - - **观察窗口完成后**:同时更新 `alert_issues.alert_state` 为 `Restored` 和 `service_states.health_state` 为 `Normal` -6. **关键原则**:每次修改 `service_states.health_state` 为 `Normal` 时,都必须同时修改 `alert_issues.alert_state` 为 `Restored` - -—— - -## 6. 
代码使用示例 - -### 数据库初始化 - -```bash -# 执行初始化脚本 -psql -U postgres -d zeroops -f init_heal_actions.sql -``` - -### 代码使用 - -```go -// 创建服务 -healDAO := NewPgHealActionDAO(db) -healService := NewHealActionService(healDAO) - -// 识别故障域 -faultDomain := healService.IdentifyFaultDomain(labels) - -// 获取治愈方案 -healAction, err := healService.GetHealAction(ctx, faultDomain) - -// 执行治愈操作 -result, err := healService.ExecuteHealAction(ctx, healAction, alertID, labels) -``` - -### 测试 - -运行测试: - -```bash -go test ./internal/alerting/service/remediation -v -``` - -测试覆盖: -- 故障域识别逻辑 -- 治愈操作执行 -- 部署状态判断 - -## 7. DB 更新(SQL 建议) +## 5. DB 更新(SQL 建议) - 告警状态: ```sql @@ -354,7 +121,7 @@ VALUES ( ); ``` -评论内容模板(Markdown,多行,内容暂未设计): +评论内容模板(Markdown,多行): ``` ## AI分析结果 **问题类型**:非发版本导致的问题 @@ -370,7 +137,7 @@ VALUES ( —— -## 8. 缓存更新(Redis,Lua CAS 建议) +## 6. 缓存更新(Redis,Lua CAS 建议) - 告警缓存 `alert:issue:{id}`: ```lua @@ -411,80 +178,30 @@ return 1 —— -## 9. 幂等与重试 +## 7. 幂等与重试 - 幂等:同一 `AlertMessage.ID` 的回滚处理应具备幂等性,重复消费不应产生额外副作用。 - 重试:Mock 模式下可忽略;接入真实接口后,对 5xx/网络错误考虑重试与退避,最终写入失败应有告警与补偿。 —— -## 10. 验证步骤(与 healthcheck E2E 相衔接) - -### 基础验证步骤 +## 8. 验证步骤(与 healthcheck E2E 相衔接) 1) 启动 Redis/Postgres 与 API(参考 `healthcheck/E2E_VALIDATION.md` 与 `env_example.txt`) -2) 创建 `heal_actions` 表并插入测试数据 -3) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)` - -### P0 告警验证(故障治愈流程) - -4) 触发 P0 级别 Webhook,`alert_issues` 入库为 `Pending` -5) 等待 `healthcheck` 将缓存态切到 `InProcessing` -6) 验证故障域识别:检查日志中是否正确识别为 `service_version_issue` -7) 验证治愈方案查询:检查是否从 `heal_actions` 表查询到对应方案 -8) 等待 `remediation` 执行治愈操作完成: - - 验证观察窗口已启动(Redis 中存在观察窗口记录) - - `alert_issue_comments` 中新增治愈完成评论 - - **重要**:验证 `alert_issues.alert_state` 仍为 `InProcessing`(未更新为 `Restored`) - - **重要**:验证 `service_states.health_state` 未更新为 `Normal` -9) 等待观察窗口完成(30分钟后)或模拟窗口期内新告警: - - **如果无新告警**: - - 验证观察窗口自动完成 - - 验证状态同时更新为 `alert_issues.alert_state = 'Restored'` 和 `service_states.health_state = 'Normal'` - - **如果有新告警**: - - 验证观察窗口被取消 - - 验证重新进入治愈流程 - - 验证状态未更新为 `Restored`/`Normal` - -### P1/P2 告警验证(下钻分析流程) - -9) 触发 P1 或 P2 级别 Webhook -10) 验证直接进入下钻分析流程,跳过故障治愈步骤 -11) 验证 AI 分析评论生成和状态更新 - -### 最终验证 - -12) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新 -13) 验证不同告警等级的处理路径正确性 +2) 创建 channel,并将其同时传给 `healthcheck.StartScheduler(..)` 与 `remediation.StartConsumer(..)` +3) `curl` 触发 Webhook,`alert_issues` 入库为 `Pending` +4) 等待 `healthcheck` 将缓存态切到 `InProcessing` +5) 等待 `remediation` mock 回滚完成 → DB 与缓存更新: + - `alert_issues.alert_state = 'Restored'` + - `service_states.health_state = 'Normal'` + - `service_states.resolved_at = NOW()` +6) 通过 Redis 与 API (`/v1/issues`、`/v1/issues/{id}`) 验证字段已更新(comments 仍为 mock) —— -## 11. 注意事项 - -1. **service_states 表逻辑**: 当前版本中,`service_states` 表的更新逻辑暂时不实现,但保留了扩展空间 -2. **Mock 模式**: 当前回滚操作为 Mock 模式,实际部署时需要接入真实的部署系统 API -3. **错误处理**: 治愈操作失败时会记录日志并继续进入下钻分析流程 -4. **幂等性**: 同一告警的重复处理应该具备幂等性 - -## 12. 后续计划 - -### 短期计划 +## 9. 
后续计划 -- 实现 `heal_actions` 表的完整 CRUD 操作 -- 完善故障域识别逻辑,支持更多故障类型 - 接入真实部署系统回滚接口与鉴权 -- 实现治愈方案的动态配置和管理界面 - -### 中期计划 - -- 扩展治愈操作类型:服务重启、扩容、流量切换等 -- 增加治愈方案的执行结果反馈和效果评估 - 将进程内 channel 平滑切换为 MQ(Kafka/NATS) -- 完善指标与可观测:事件消费速率、成功率、时延分位、治愈结果等 - -### 长期计划 - -- 基于历史数据训练 AI 模型,自动推荐最优治愈方案 -- 增加补偿任务:对"治愈成功但缓存/DB 未一致"的场景进行对账修复 -- 实现治愈方案的 A/B 测试和效果对比 -- 构建完整的故障自愈知识库和最佳实践库 +- 完善指标与可观测:事件消费速率、成功率、时延分位、回滚结果等 +- 增加补偿任务:对“回滚成功但缓存/DB 未一致”的场景进行对账修复 diff --git a/internal/alerting/service/remediation/consumer.go b/internal/alerting/service/remediation/consumer.go index cead8d1..07cae6b 100644 --- a/internal/alerting/service/remediation/consumer.go +++ b/internal/alerting/service/remediation/consumer.go @@ -3,6 +3,7 @@ package remediation import ( "context" "fmt" + "os" "strconv" "time" @@ -16,189 +17,46 @@ type Consumer struct { DB *adb.Database Redis *redis.Client - // Heal action service for P0 alerts - healService HealActionService - - // Observation window manager - obsManager ObservationWindowManager - // sleepFn allows overriding for tests sleepFn func(time.Duration) } func NewConsumer(db *adb.Database, rdb *redis.Client) *Consumer { - healDAO := NewPgHealActionDAO(db) - healService := NewHealActionService(healDAO) - obsManager := NewRedisObservationWindowManager(rdb) - return &Consumer{ - DB: db, - Redis: rdb, - healService: healService, - obsManager: obsManager, - sleepFn: time.Sleep, - } + return &Consumer{DB: db, Redis: rdb, sleepFn: time.Sleep} } -// Start consumes alert messages and processes them based on alert level +// Start consumes alert messages and performs a mocked rollback then marks restored. func (c *Consumer) Start(ctx context.Context, ch <-chan healthcheck.AlertMessage) { if ch == nil { log.Warn().Msg("remediation consumer started without channel; no-op") return } - + sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) for { select { case <-ctx.Done(): return case m := <-ch: - // 首先检查是否有观察窗口需要处理 - c.handleObservationWindow(ctx, &m) - - switch m.Level { - case "P0": - // P0 告警:故障治愈流程 - c.handleP0Alert(ctx, &m) - case "P1", "P2": - // P1/P2 告警:下钻分析流程 - c.handleP1P2Alert(ctx, &m) - default: - log.Warn().Str("level", m.Level).Str("issue", m.ID).Msg("unknown alert level, skipping") + // 1) Mock rollback: optional URL composition (unused) + _ = fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(&m)) + // 2) Sleep to simulate rollback time + if c.sleepFn != nil { + c.sleepFn(sleepDur) + } + // 3) On success: add AI analysis comment, update DB and cache + if err := c.addAIAnalysisComment(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") + } + if err := c.markRestoredInDB(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") + } + if err := c.markRestoredInCache(ctx, &m); err != nil { + log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") } } } } -// handleObservationWindow handles observation window logic for incoming alerts -func (c *Consumer) handleObservationWindow(ctx context.Context, m *healthcheck.AlertMessage) { - if m.Service == "" { - return // No service information, skip observation window check - } - - // 检查是否有该服务的观察窗口 - window, err := c.obsManager.CheckObservation(ctx, m.Service, m.Version) - if err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to check observation window") - return - } - - if window == nil { - return // No active observation window - } - - // 如果在观察窗口期间出现新的告警,取消观察窗口 
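	// Cancellation is best-effort: the alert that triggered it still falls
	// through to the level switch in Start, so the service re-enters the
	// healing path instead of silently completing the window.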
- log.Warn(). - Str("service", m.Service). - Str("version", m.Version). - Str("alert_id", m.ID). - Str("observation_alert_id", window.AlertID). - Msg("new alert detected during observation window, cancelling observation") - - if err := c.obsManager.CancelObservation(ctx, m.Service, m.Version); err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to cancel observation window") - } -} - -// handleP0Alert handles P0 alerts with fault healing process -func (c *Consumer) handleP0Alert(ctx context.Context, m *healthcheck.AlertMessage) { - log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P0 alert with fault healing") - - // 1) 确认故障域 - faultDomain := c.healService.IdentifyFaultDomain(m.Labels) - log.Info().Str("issue", m.ID).Str("fault_domain", string(faultDomain)).Msg("identified fault domain") - - // 2) 查询治愈方案 - healAction, err := c.healService.GetHealAction(ctx, faultDomain) - if err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("failed to get heal action") - // 如果无法获取治愈方案,直接进入下钻分析 - c.handleDrillDownAnalysis(ctx, m) - return - } - - // 3) 执行治愈操作 - result, err := c.healService.ExecuteHealAction(ctx, healAction, m.ID, m.Labels) - if err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("failed to execute heal action") - c.handleDrillDownAnalysis(ctx, m) - return - } - - if !result.Success { - log.Warn().Str("issue", m.ID).Str("message", result.Message).Msg("heal action failed") - // 治愈失败,仍然进入下钻分析 - c.handleDrillDownAnalysis(ctx, m) - return - } - - log.Info().Str("issue", m.ID).Str("message", result.Message).Msg("heal action completed successfully") - - // 4) 治愈成功后启动观察窗口 - if m.Service != "" { - obsDuration := GetObservationDuration() - if err := c.obsManager.StartObservation(ctx, m.Service, m.Version, m.ID, obsDuration); err != nil { - log.Error().Err(err).Str("service", m.Service).Str("version", m.Version).Msg("failed to start observation window") - } else { - log.Info(). - Str("service", m.Service). - Str("version", m.Version). - Str("alert_id", m.ID). - Dur("duration", obsDuration). 
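				// obsDuration comes from GetObservationDuration(), which is a
				// fixed 30-minute default (see observation_window.go).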
- Msg("started observation window after successful healing") - } - } - - // 5) 治愈成功后进入下钻分析(但不立即更新状态) - c.handleDrillDownAnalysisWithObservation(ctx, m) -} - -// handleP1P2Alert handles P1/P2 alerts with drill-down analysis -func (c *Consumer) handleP1P2Alert(ctx context.Context, m *healthcheck.AlertMessage) { - log.Info().Str("issue", m.ID).Str("level", m.Level).Msg("processing P1/P2 alert with drill-down analysis") - - // 直接进入下钻分析流程 - c.handleDrillDownAnalysis(ctx, m) -} - -// handleDrillDownAnalysis performs drill-down analysis and marks alert as restored -func (c *Consumer) handleDrillDownAnalysis(ctx context.Context, m *healthcheck.AlertMessage) { - // 1) 执行 AI 分析 - if err := c.addAIAnalysisComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - - // 2) 更新告警状态为恢复 - if err := c.markRestoredInDB(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInDB failed") - } - - // 3) 更新缓存状态 - if err := c.markRestoredInCache(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("markRestoredInCache failed") - } -} - -// handleDrillDownAnalysisWithObservation performs drill-down analysis but delays status update for observation -func (c *Consumer) handleDrillDownAnalysisWithObservation(ctx context.Context, m *healthcheck.AlertMessage) { - // 1) 执行 AI 分析 - if err := c.addAIAnalysisComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addAIAnalysisComment failed") - } - - // 2) 暂时不更新告警状态,等待观察窗口完成 - // 只记录治愈操作完成的评论 - if err := c.addHealingCompletedComment(ctx, m); err != nil { - log.Error().Err(err).Str("issue", m.ID).Msg("addHealingCompletedComment failed") - } - - log.Info(). - Str("issue", m.ID). - Str("service", m.Service). - Str("version", m.Version). 
- Msg("healing completed, waiting for observation window to complete before updating status") -} - -// deriveDeployID derives deployment ID from alert message -// TODO: Use this function when implementing real rollback API calls func deriveDeployID(m *healthcheck.AlertMessage) string { if m == nil { return "" @@ -233,38 +91,15 @@ func (c *Consumer) addAIAnalysisComment(ctx context.Context, m *healthcheck.Aler return err } -func (c *Consumer) addHealingCompletedComment(ctx context.Context, m *healthcheck.AlertMessage) error { - if c.DB == nil || m == nil { - return nil - } - const existsQ = `SELECT 1 FROM alert_issue_comments WHERE issue_id=$1 AND content=$2 LIMIT 1` - const insertQ = `INSERT INTO alert_issue_comments (issue_id, create_at, content) VALUES ($1, NOW(), $2)` - content := "## 治愈操作完成\n" + - "**操作状态**:治愈操作已成功执行\n" + - "**观察窗口**:正在等待观察窗口完成(30分钟)\n" + - "**下一步**:如果观察窗口内无新告警,将自动更新服务状态为正常" - if rows, err := c.DB.QueryContext(ctx, existsQ, m.ID, content); err == nil { - defer rows.Close() - if rows.Next() { - return nil - } - } - _, err := c.DB.ExecContext(ctx, insertQ, m.ID, content) - return err -} - func (c *Consumer) markRestoredInDB(ctx context.Context, m *healthcheck.AlertMessage) error { if c.DB == nil || m == nil { return nil } - - // 更新 alert_issues 状态 + // alert_issues if _, err := c.DB.ExecContext(ctx, `UPDATE alert_issues SET alert_state = 'Restored' , state = 'Closed' WHERE id = $1`, m.ID); err != nil { return err } - - // 同时更新 service_states.health_state 为 Normal - // 注意:每次修改 service_states 为 Normal 时都需要修改 alert_issues.alert_state 为 Restored + // service_states (upsert) if m.Service != "" { const upsert = ` INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) @@ -277,7 +112,6 @@ SET health_state = 'Normal', return err } } - return nil } @@ -312,7 +146,7 @@ return 1 `) _, _ = script.Run(ctx, c.Redis, []string{alertKey, "alert:index:alert_state:Pending", "alert:index:alert_state:InProcessing", "alert:index:alert_state:Restored", "alert:index:open", "alert:index:closed"}, "Restored", m.ID, "Closed").Result() - // 更新 service_state 缓存 + // 2) service_state:{service}:{version} → health_state=Normal; resolved_at=now; add to Normal index if m.Service != "" { svcKey := "service_state:" + m.Service + ":" + m.Version now := time.Now().UTC().Format(time.RFC3339Nano) @@ -331,54 +165,6 @@ return 1 return nil } -// CompleteObservationAndUpdateStatus completes observation window and updates service status -func (c *Consumer) CompleteObservationAndUpdateStatus(ctx context.Context, service, version string) error { - if service == "" { - return fmt.Errorf("service name is required") - } - - // 完成观察窗口 - if err := c.obsManager.CompleteObservation(ctx, service, version); err != nil { - return fmt.Errorf("failed to complete observation window: %w", err) - } - - // 更新服务状态为正常 - const upsert = ` -INSERT INTO service_states (service, version, report_at, resolved_at, health_state, alert_issue_ids) -VALUES ($1, $2, NULL, NOW(), 'Normal', ARRAY[]::text[]) -ON CONFLICT (service, version) DO UPDATE -SET health_state = 'Normal', - resolved_at = NOW(); -` - if _, err := c.DB.ExecContext(ctx, upsert, service, version); err != nil { - return fmt.Errorf("failed to update service state: %w", err) - } - - // 更新缓存 - if c.Redis != nil { - svcKey := "service_state:" + service + ":" + version - now := time.Now().UTC().Format(time.RFC3339Nano) - svcScript := redis.NewScript(` -local v = redis.call('GET', KEYS[1]) -if not v then v = '{}' end -local obj = 
cjson.decode(v) -obj.health_state = ARGV[1] -obj.resolved_at = ARGV[2] -redis.call('SET', KEYS[1], cjson.encode(obj), 'KEEPTTL') -if KEYS[2] ~= '' then redis.call('SADD', KEYS[2], KEYS[1]) end -return 1 -`) - _, _ = svcScript.Run(ctx, c.Redis, []string{svcKey, "service_state:index:health:Normal"}, "Normal", now).Result() - } - - log.Info(). - Str("service", service). - Str("version", version). - Msg("observation window completed successfully, service status updated to Normal") - - return nil -} - func parseDuration(s string, d time.Duration) time.Duration { if s == "" { return d diff --git a/internal/alerting/service/remediation/heal_action_dao.go b/internal/alerting/service/remediation/heal_action_dao.go deleted file mode 100644 index af71b5e..0000000 --- a/internal/alerting/service/remediation/heal_action_dao.go +++ /dev/null @@ -1,145 +0,0 @@ -package remediation - -import ( - "context" - "database/sql" - "encoding/json" - "fmt" - - adb "github.com/qiniu/zeroops/internal/alerting/database" -) - -// PgHealActionDAO implements HealActionDAO using PostgreSQL -type PgHealActionDAO struct { - DB *adb.Database -} - -// NewPgHealActionDAO creates a new PostgreSQL heal action DAO -func NewPgHealActionDAO(db *adb.Database) *PgHealActionDAO { - return &PgHealActionDAO{DB: db} -} - -// GetByType retrieves a heal action by fault domain type -func (d *PgHealActionDAO) GetByType(ctx context.Context, faultType string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE type = $1 LIMIT 1` - - row := d.DB.QueryRowContext(ctx, q, faultType) - var action HealAction - var rulesJSON string - - err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - if err == sql.ErrNoRows { - return nil, fmt.Errorf("no heal action found for type: %s", faultType) - } - return nil, fmt.Errorf("failed to get heal action by type: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - return &action, nil -} - -// GetByID retrieves a heal action by ID -func (d *PgHealActionDAO) GetByID(ctx context.Context, id string) (*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions WHERE id = $1` - - row := d.DB.QueryRowContext(ctx, q, id) - var action HealAction - var rulesJSON string - - err := row.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - if err == sql.ErrNoRows { - return nil, fmt.Errorf("no heal action found with id: %s", id) - } - return nil, fmt.Errorf("failed to get heal action by id: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - return &action, nil -} - -// Create creates a new heal action -func (d *PgHealActionDAO) Create(ctx context.Context, action *HealAction) error { - const q = `INSERT INTO heal_actions (id, desc, type, rules) VALUES ($1, $2, $3, $4)` - - _, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules)) - if err != nil { - return fmt.Errorf("failed to create heal action: %w", err) - } - - return nil -} - -// Update updates an existing heal action -func (d *PgHealActionDAO) Update(ctx context.Context, action *HealAction) error { - const q = `UPDATE heal_actions SET desc = $2, type = $3, rules = $4 WHERE id = $1` - - result, err := d.DB.ExecContext(ctx, q, action.ID, action.Desc, action.Type, string(action.Rules)) - if err != nil { - return fmt.Errorf("failed to update heal action: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if 
rowsAffected == 0 { - return fmt.Errorf("no heal action found with id: %s", action.ID) - } - - return nil -} - -// Delete deletes a heal action by ID -func (d *PgHealActionDAO) Delete(ctx context.Context, id string) error { - const q = `DELETE FROM heal_actions WHERE id = $1` - - result, err := d.DB.ExecContext(ctx, q, id) - if err != nil { - return fmt.Errorf("failed to delete heal action: %w", err) - } - - rowsAffected, err := result.RowsAffected() - if err != nil { - return fmt.Errorf("failed to get rows affected: %w", err) - } - - if rowsAffected == 0 { - return fmt.Errorf("no heal action found with id: %s", id) - } - - return nil -} - -// List retrieves all heal actions -func (d *PgHealActionDAO) List(ctx context.Context) ([]*HealAction, error) { - const q = `SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id` - - rows, err := d.DB.QueryContext(ctx, q) - if err != nil { - return nil, fmt.Errorf("failed to list heal actions: %w", err) - } - defer rows.Close() - - var actions []*HealAction - for rows.Next() { - var action HealAction - var rulesJSON string - - err := rows.Scan(&action.ID, &action.Desc, &action.Type, &rulesJSON) - if err != nil { - return nil, fmt.Errorf("failed to scan heal action: %w", err) - } - - action.Rules = json.RawMessage(rulesJSON) - actions = append(actions, &action) - } - - if err = rows.Err(); err != nil { - return nil, fmt.Errorf("error iterating heal actions: %w", err) - } - - return actions, nil -} diff --git a/internal/alerting/service/remediation/heal_action_service.go b/internal/alerting/service/remediation/heal_action_service.go deleted file mode 100644 index aa9a8f0..0000000 --- a/internal/alerting/service/remediation/heal_action_service.go +++ /dev/null @@ -1,172 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "fmt" - "os" - "time" - - "github.com/rs/zerolog/log" -) - -// HealActionServiceImpl implements HealActionService -type HealActionServiceImpl struct { - dao HealActionDAO -} - -// NewHealActionService creates a new heal action service -func NewHealActionService(dao HealActionDAO) *HealActionServiceImpl { - return &HealActionServiceImpl{dao: dao} -} - -// IdentifyFaultDomain identifies the fault domain from alert labels -func (s *HealActionServiceImpl) IdentifyFaultDomain(labels map[string]string) FaultDomain { - service := labels["service_name"] - version := labels["version"] - - if service != "" && version != "" { - return FaultDomainServiceVersion - } - - // TODO: 可根据更多条件扩展其他故障域 - // - 整体问题:检查是否有全局性指标异常 - // - 单机房问题:检查是否有机房相关标签 - // - 网络问题:检查是否有网络相关标签 - return FaultDomainUnknown -} - -// GetHealAction retrieves the appropriate heal action for a fault domain -func (s *HealActionServiceImpl) GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) { - if faultDomain == FaultDomainUnknown { - return nil, fmt.Errorf("unknown fault domain, cannot determine heal action") - } - - action, err := s.dao.GetByType(ctx, string(faultDomain)) - if err != nil { - return nil, fmt.Errorf("failed to get heal action for domain %s: %w", faultDomain, err) - } - - return action, nil -} - -// ExecuteHealAction executes the heal action based on the rules -func (s *HealActionServiceImpl) ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) { - if action == nil { - return &HealActionResult{ - Success: false, - Error: "no heal action provided", - }, nil - } - - // Parse the rules - var rules HealActionRules - if err := 
json.Unmarshal(action.Rules, &rules); err != nil { - return &HealActionResult{ - Success: false, - Error: fmt.Sprintf("failed to parse heal action rules: %v", err), - }, nil - } - - // Execute based on action type - switch rules.Action { - case "rollback": - return s.executeRollback(ctx, rules, alertID, labels) - case "alert": - return s.executeAlert(rules, alertID, labels) - default: - return &HealActionResult{ - Success: false, - Error: fmt.Sprintf("unsupported action type: %s", rules.Action), - }, nil - } -} - -// executeRollback executes a rollback operation -func (s *HealActionServiceImpl) executeRollback(ctx context.Context, rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { - _ = ctx // TODO: Use context for HTTP timeout when calling real rollback API - // Check deployment status if specified - if rules.DeploymentStatus != "" { - // TODO: 实际实现中应该查询部署系统获取真实的部署状态 - // 这里暂时模拟检查 - deployStatus := s.getDeploymentStatus(labels) - if deployStatus != rules.DeploymentStatus { - return &HealActionResult{ - Success: false, - Message: fmt.Sprintf("deployment status mismatch: expected %s, got %s", rules.DeploymentStatus, deployStatus), - }, nil - } - } - - // Mock rollback execution - sleepDur := parseDuration(os.Getenv("REMEDIATION_ROLLBACK_SLEEP"), 30*time.Second) - log.Info(). - Str("alert_id", alertID). - Str("target", rules.Target). - Dur("sleep_duration", sleepDur). - Msg("executing mock rollback") - - // Simulate rollback time - time.Sleep(sleepDur) - - // TODO: 实际实现中应该调用真实的回滚接口 - // url := fmt.Sprintf(os.Getenv("REMEDIATION_ROLLBACK_URL"), deriveDeployID(labels)) - // 发起 HTTP POST 请求到回滚接口 - - return &HealActionResult{ - Success: true, - Message: fmt.Sprintf("rollback completed successfully, target: %s", rules.Target), - }, nil -} - -// executeAlert executes an alert-only action (no automatic healing) -func (s *HealActionServiceImpl) executeAlert(rules HealActionRules, alertID string, labels map[string]string) (*HealActionResult, error) { - _ = labels // TODO: Use labels for context-specific alert messages - log.Warn(). - Str("alert_id", alertID). - Str("message", rules.Message). - Msg("heal action requires manual intervention") - - return &HealActionResult{ - Success: false, - Message: rules.Message, - }, nil -} - -// getDeploymentStatus gets the deployment status for the given labels -// TODO: 实际实现中应该查询部署系统获取真实的部署状态 -func (s *HealActionServiceImpl) getDeploymentStatus(labels map[string]string) string { - // 这里暂时返回模拟状态 - // 实际实现中应该: - // 1. 从 labels 中提取 service 和 version - // 2. 查询部署系统 API 获取当前部署状态 - // 3. 
返回 "deploying" 或 "deployed" - - service := labels["service_name"] - version := labels["version"] - - if service == "" || version == "" { - return "unknown" - } - - // 模拟逻辑:如果版本号包含 "dev" 或 "test",认为是发布中,待确认修改为实际的部署状态区分方式 - if version == "dev" || version == "test" { - return "deploying" - } - - return "deployed" -} - -// deriveDeployIDFromLabels derives deployment ID from labels -// TODO: Use this function when implementing real rollback API calls -func deriveDeployIDFromLabels(labels map[string]string) string { - if v := labels["deploy_id"]; v != "" { - return v - } - service := labels["service_name"] - version := labels["version"] - if service != "" && version != "" { - return fmt.Sprintf("%s:%s", service, version) - } - return "" -} diff --git a/internal/alerting/service/remediation/heal_action_service_test.go b/internal/alerting/service/remediation/heal_action_service_test.go deleted file mode 100644 index eb45209..0000000 --- a/internal/alerting/service/remediation/heal_action_service_test.go +++ /dev/null @@ -1,178 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "testing" -) - -func TestHealActionServiceImpl_IdentifyFaultDomain(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - labels map[string]string - expected FaultDomain - }{ - { - name: "service_version_issue", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expected: FaultDomainServiceVersion, - }, - { - name: "missing_service_name", - labels: map[string]string{ - "version": "v1.0.0", - }, - expected: FaultDomainUnknown, - }, - { - name: "missing_version", - labels: map[string]string{ - "service_name": "test-service", - }, - expected: FaultDomainUnknown, - }, - { - name: "empty_labels", - labels: map[string]string{}, - expected: FaultDomainUnknown, - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := service.IdentifyFaultDomain(tt.labels) - if result != tt.expected { - t.Errorf("IdentifyFaultDomain() = %v, want %v", result, tt.expected) - } - }) - } -} - -func TestHealActionServiceImpl_ExecuteHealAction(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - action *HealAction - alertID string - labels map[string]string - expectError bool - }{ - { - name: "rollback_action", - action: &HealAction{ - ID: "test-rollback", - Desc: "Test rollback action", - Type: "service_version_issue", - Rules: json.RawMessage(`{ - "deployment_status": "deploying", - "action": "rollback", - "target": "previous_version" - }`), - }, - alertID: "test-alert-1", - labels: map[string]string{ - "service_name": "test-service", - "version": "dev", - }, - expectError: false, - }, - { - name: "alert_action", - action: &HealAction{ - ID: "test-alert", - Desc: "Test alert action", - Type: "service_version_issue", - Rules: json.RawMessage(`{ - "action": "alert", - "message": "Version already deployed, manual intervention required" - }`), - }, - alertID: "test-alert-2", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expectError: false, - }, - { - name: "nil_action", - action: nil, - alertID: "test-alert-3", - labels: map[string]string{}, - expectError: false, // Should not error, but return failure result - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result, err := service.ExecuteHealAction(context.Background(), tt.action, tt.alertID, tt.labels) - - if tt.expectError && err == nil { - 
t.Errorf("ExecuteHealAction() expected error but got none") - } - if !tt.expectError && err != nil { - t.Errorf("ExecuteHealAction() unexpected error: %v", err) - } - - if result == nil { - t.Errorf("ExecuteHealAction() returned nil result") - } - }) - } -} - -func TestHealActionServiceImpl_getDeploymentStatus(t *testing.T) { - service := &HealActionServiceImpl{} - - tests := []struct { - name string - labels map[string]string - expected string - }{ - { - name: "deploying_version", - labels: map[string]string{ - "service_name": "test-service", - "version": "dev", - }, - expected: "deploying", - }, - { - name: "deployed_version", - labels: map[string]string{ - "service_name": "test-service", - "version": "v1.0.0", - }, - expected: "deployed", - }, - { - name: "missing_service_name", - labels: map[string]string{ - "version": "v1.0.0", - }, - expected: "unknown", - }, - { - name: "missing_version", - labels: map[string]string{ - "service_name": "test-service", - }, - expected: "unknown", - }, - } - - for _, tt := range tests { - t.Run(tt.name, func(t *testing.T) { - result := service.getDeploymentStatus(tt.labels) - if result != tt.expected { - t.Errorf("getDeploymentStatus() = %v, want %v", result, tt.expected) - } - }) - } -} diff --git a/internal/alerting/service/remediation/init_heal_actions.sql b/internal/alerting/service/remediation/init_heal_actions.sql deleted file mode 100644 index b11fcee..0000000 --- a/internal/alerting/service/remediation/init_heal_actions.sql +++ /dev/null @@ -1,38 +0,0 @@ --- 创建 heal_actions 表 -CREATE TABLE IF NOT EXISTS heal_actions ( - id VARCHAR(255) PRIMARY KEY, - desc TEXT NOT NULL, - type VARCHAR(255) NOT NULL, - rules JSONB NOT NULL -); - --- 创建索引 -CREATE INDEX IF NOT EXISTS idx_heal_actions_type ON heal_actions(type); - --- 插入示例数据 -INSERT INTO heal_actions (id, desc, type, rules) VALUES -( - 'service_version_rollback_deploying', - '服务版本回滚方案(发布中版本)', - 'service_version_issue', - '{"deployment_status": "deploying", "action": "rollback", "target": "previous_version"}' -), -( - 'service_version_alert_deployed', - '服务版本告警方案(已完成发布版本)', - 'service_version_issue', - '{"deployment_status": "deployed", "action": "alert", "message": "版本已发布,暂不支持自动回滚,需要人工介入处理"}' -), -( - 'service_version_rollback_default', - '服务版本回滚方案(默认)', - 'service_version_issue', - '{"action": "rollback", "target": "previous_version"}' -) -ON CONFLICT (id) DO UPDATE SET - desc = EXCLUDED.desc, - type = EXCLUDED.type, - rules = EXCLUDED.rules; - --- 查询验证 -SELECT id, desc, type, rules FROM heal_actions ORDER BY type, id; diff --git a/internal/alerting/service/remediation/observation_window.go b/internal/alerting/service/remediation/observation_window.go deleted file mode 100644 index c07e146..0000000 --- a/internal/alerting/service/remediation/observation_window.go +++ /dev/null @@ -1,169 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "fmt" - "time" - - "github.com/redis/go-redis/v9" - "github.com/rs/zerolog/log" -) - -// RedisObservationWindowManager implements ObservationWindowManager using Redis -type RedisObservationWindowManager struct { - redis *redis.Client -} - -// NewRedisObservationWindowManager creates a new Redis-based observation window manager -func NewRedisObservationWindowManager(redis *redis.Client) *RedisObservationWindowManager { - return &RedisObservationWindowManager{redis: redis} -} - -// StartObservation starts an observation window for a service -func (m *RedisObservationWindowManager) StartObservation(ctx context.Context, service, version, alertID 
string, duration time.Duration) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - now := time.Now() - window := &ObservationWindow{ - Duration: duration, - Service: service, - Version: version, - AlertID: alertID, - StartTime: now, - EndTime: now.Add(duration), - IsActive: true, - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - data, err := json.Marshal(window) - if err != nil { - return fmt.Errorf("failed to marshal observation window: %w", err) - } - - // Store with TTL equal to observation duration + buffer - ttl := duration + 5*time.Minute - err = m.redis.Set(ctx, key, data, ttl).Err() - if err != nil { - return fmt.Errorf("failed to store observation window: %w", err) - } - - log.Info(). - Str("service", service). - Str("version", version). - Str("alert_id", alertID). - Dur("duration", duration). - Time("end_time", window.EndTime). - Msg("started observation window") - - return nil -} - -// CheckObservation checks if there's an active observation window for a service -func (m *RedisObservationWindowManager) CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) { - if m.redis == nil { - return nil, fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - data, err := m.redis.Get(ctx, key).Result() - if err != nil { - if err == redis.Nil { - return nil, nil // No active observation window - } - return nil, fmt.Errorf("failed to get observation window: %w", err) - } - - var window ObservationWindow - err = json.Unmarshal([]byte(data), &window) - if err != nil { - return nil, fmt.Errorf("failed to unmarshal observation window: %w", err) - } - - // Check if observation window has expired - if time.Now().After(window.EndTime) { - // Clean up expired window - m.redis.Del(ctx, key) - return nil, nil - } - - return &window, nil -} - -// CompleteObservation completes an observation window and marks it as successful -func (m *RedisObservationWindowManager) CompleteObservation(ctx context.Context, service, version string) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - - // Get the current window - window, err := m.CheckObservation(ctx, service, version) - if err != nil { - return fmt.Errorf("failed to check observation window: %w", err) - } - - if window == nil { - return fmt.Errorf("no active observation window found for service %s version %s", service, version) - } - - // Mark as completed and remove from Redis - window.IsActive = false - err = m.redis.Del(ctx, key).Err() - if err != nil { - return fmt.Errorf("failed to remove observation window: %w", err) - } - - log.Info(). - Str("service", service). - Str("version", version). - Str("alert_id", window.AlertID). - Dur("duration", window.Duration). 
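		// The Redis key was deleted above, so a repeat CompleteObservation for
		// the same service/version fails with the "no active window" error.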
- Msg("completed observation window successfully") - - return nil -} - -// CancelObservation cancels an observation window due to new alerts -func (m *RedisObservationWindowManager) CancelObservation(ctx context.Context, service, version string) error { - if m.redis == nil { - return fmt.Errorf("redis client is nil") - } - - key := fmt.Sprintf("observation:%s:%s", service, version) - - // Get the current window for logging - window, err := m.CheckObservation(ctx, service, version) - if err != nil { - return fmt.Errorf("failed to check observation window: %w", err) - } - - if window == nil { - return nil // No active window to cancel - } - - // Remove the observation window - err = m.redis.Del(ctx, key).Err() - if err != nil { - return fmt.Errorf("failed to cancel observation window: %w", err) - } - - log.Warn(). - Str("service", service). - Str("version", version). - Str("alert_id", window.AlertID). - Msg("cancelled observation window due to new alerts") - - return nil -} - -// GetObservationDuration returns the configured observation duration -// TODO: 后续可以从配置或数据库中动态获取观察时间 -func GetObservationDuration() time.Duration { - // 暂时使用固定的30分钟观察窗口 - // 后续可以扩展为从环境变量或配置文件中读取 - return 30 * time.Minute -} diff --git a/internal/alerting/service/remediation/observation_window_test.go b/internal/alerting/service/remediation/observation_window_test.go deleted file mode 100644 index 8a2ec35..0000000 --- a/internal/alerting/service/remediation/observation_window_test.go +++ /dev/null @@ -1,100 +0,0 @@ -package remediation - -import ( - "context" - "testing" - "time" - - "github.com/redis/go-redis/v9" - "github.com/stretchr/testify/assert" - "github.com/stretchr/testify/require" -) - -func TestRedisObservationWindowManager(t *testing.T) { - // 使用内存 Redis 客户端进行测试 - rdb := redis.NewClient(&redis.Options{ - Addr: "localhost:6379", // 需要 Redis 实例 - }) - defer rdb.Close() - - // 检查 Redis 连接 - ctx := context.Background() - if err := rdb.Ping(ctx).Err(); err != nil { - t.Skip("Redis not available, skipping test") - } - - manager := NewRedisObservationWindowManager(rdb) - - t.Run("StartObservation", func(t *testing.T) { - service := "test-service" - version := "v1.0.0" - alertID := "test-alert-1" - duration := 5 * time.Minute - - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 验证观察窗口已创建 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - require.NotNil(t, window) - assert.Equal(t, service, window.Service) - assert.Equal(t, version, window.Version) - assert.Equal(t, alertID, window.AlertID) - assert.True(t, window.IsActive) - }) - - t.Run("CheckObservation_NotFound", func(t *testing.T) { - service := "non-existent-service" - version := "v1.0.0" - - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) - - t.Run("CompleteObservation", func(t *testing.T) { - service := "test-service-2" - version := "v1.0.0" - alertID := "test-alert-2" - duration := 5 * time.Minute - - // 先创建观察窗口 - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 完成观察窗口 - err = manager.CompleteObservation(ctx, service, version) - require.NoError(t, err) - - // 验证观察窗口已被移除 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) - - t.Run("CancelObservation", func(t *testing.T) { - service := "test-service-3" - version := "v1.0.0" - alertID := "test-alert-3" - duration := 5 
* time.Minute - - // 先创建观察窗口 - err := manager.StartObservation(ctx, service, version, alertID, duration) - require.NoError(t, err) - - // 取消观察窗口 - err = manager.CancelObservation(ctx, service, version) - require.NoError(t, err) - - // 验证观察窗口已被移除 - window, err := manager.CheckObservation(ctx, service, version) - require.NoError(t, err) - assert.Nil(t, window) - }) -} - -func TestGetObservationDuration(t *testing.T) { - duration := GetObservationDuration() - assert.Equal(t, 30*time.Minute, duration) -} diff --git a/internal/alerting/service/remediation/types.go b/internal/alerting/service/remediation/types.go deleted file mode 100644 index c1c5f02..0000000 --- a/internal/alerting/service/remediation/types.go +++ /dev/null @@ -1,74 +0,0 @@ -package remediation - -import ( - "context" - "encoding/json" - "time" -) - -// HealAction represents a healing action configuration -type HealAction struct { - ID string `json:"id"` - Desc string `json:"desc"` - Type string `json:"type"` - Rules json.RawMessage `json:"rules"` -} - -// HealActionRules represents the rules for a heal action -type HealActionRules struct { - DeploymentStatus string `json:"deployment_status,omitempty"` - Action string `json:"action"` - Target string `json:"target,omitempty"` - Message string `json:"message,omitempty"` -} - -// FaultDomain represents the identified fault domain -type FaultDomain string - -const ( - FaultDomainServiceVersion FaultDomain = "service_version_issue" - FaultDomainUnknown FaultDomain = "unknown" -) - -// HealActionResult represents the result of executing a heal action -type HealActionResult struct { - Success bool `json:"success"` - Message string `json:"message,omitempty"` - Error string `json:"error,omitempty"` -} - -// ObservationWindow represents the observation period after healing -type ObservationWindow struct { - Duration time.Duration `json:"duration"` - Service string `json:"service"` - Version string `json:"version"` - AlertID string `json:"alert_id"` - StartTime time.Time `json:"start_time"` - EndTime time.Time `json:"end_time"` - IsActive bool `json:"is_active"` -} - -// ObservationWindowManager defines the interface for managing observation windows -type ObservationWindowManager interface { - StartObservation(ctx context.Context, service, version, alertID string, duration time.Duration) error - CheckObservation(ctx context.Context, service, version string) (*ObservationWindow, error) - CompleteObservation(ctx context.Context, service, version string) error - CancelObservation(ctx context.Context, service, version string) error -} - -// HealActionDAO defines the interface for heal action database operations -type HealActionDAO interface { - GetByType(ctx context.Context, faultType string) (*HealAction, error) - GetByID(ctx context.Context, id string) (*HealAction, error) - Create(ctx context.Context, action *HealAction) error - Update(ctx context.Context, action *HealAction) error - Delete(ctx context.Context, id string) error - List(ctx context.Context) ([]*HealAction, error) -} - -// HealActionService defines the interface for heal action business logic -type HealActionService interface { - IdentifyFaultDomain(labels map[string]string) FaultDomain - GetHealAction(ctx context.Context, faultDomain FaultDomain) (*HealAction, error) - ExecuteHealAction(ctx context.Context, action *HealAction, alertID string, labels map[string]string) (*HealActionResult, error) -}
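
补充示例:新版 `Consumer` 保留了 `sleepFn` 钩子,正是为了让 Mock 回滚可以在不真实等待 30s 的情况下做单元测试。下面是一个最小化的测试草图,仅作示意(假设:测试与 consumer.go 同属 `remediation` 包;healthcheck 包位于 `internal/alerting/service/healthcheck`;各存储辅助函数都像 `markRestoredInDB` 一样对 nil DB/Redis 短路;`healthcheck.AlertMessage` 可仅以 `ID`/`Service`/`Version` 构造):

```go
package remediation

import (
	"context"
	"testing"
	"time"

	"github.com/qiniu/zeroops/internal/alerting/service/healthcheck"
)

// TestStartMockRollbackSleep drives one message through Start with an
// instant sleepFn. DB/Redis are left nil on the assumption that every
// storage helper short-circuits the way markRestoredInDB visibly does.
func TestStartMockRollbackSleep(t *testing.T) {
	t.Setenv("REMEDIATION_ROLLBACK_SLEEP", "") // exercise the 30s default
	t.Setenv("REMEDIATION_ROLLBACK_URL", "")   // URL is composed but unused

	var slept time.Duration
	c := &Consumer{sleepFn: func(d time.Duration) { slept = d }}

	ch := make(chan healthcheck.AlertMessage, 1)
	ch <- healthcheck.AlertMessage{ID: "issue-1", Service: "svc", Version: "v1"}

	ctx, cancel := context.WithCancel(context.Background())
	go func() {
		time.Sleep(100 * time.Millisecond) // let the single message drain
		cancel()                           // then stop the consumer loop
	}()
	c.Start(ctx, ch)

	if slept != 30*time.Second {
		t.Fatalf("expected default 30s mock-rollback sleep, got %v", slept)
	}
}
```

用 `sleepFn` 注入替代真实 sleep,可让单测在毫秒级完成;`t.Setenv` 清空环境变量,从而覆盖 `parseDuration` 的 30s 默认值路径。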