feat(api): alert evaluation engine (keephq#3138)

ezhil56x · Jan 27, 2025 · b0f371f · b0f371f
1 parent 427dc27
commit b0f371f
Show file tree

Hide file tree

Showing 18 changed files with 1,721 additions and 118 deletions.
diff --git a/docs/alertevaluation/examples/victoriametricsmulti.mdx b/docs/alertevaluation/examples/victoriametricsmulti.mdx
@@ -0,0 +1,67 @@
+---
+title: "VictoriaMetrics Multi Alert Example"
+---
+
+This example demonstrates a CPU usage multi-alert: a single query returns one value per job, and a separate alert is created for each job:
+
+```yaml
+workflow:
+  # Unique identifier for this workflow
+  id: query-victoriametrics-multi
+  # Display name shown in the UI
+  name: victoriametrics-multi-alert-example
+  # Brief description of what this workflow does
+  description: victoriametrics
+  triggers:
+    # This workflow can be triggered manually from the UI
+    - type: manual
+  steps:
+    # Query VictoriaMetrics for CPU metrics
+    - name: victoriametrics-step
+      provider:
+        # Use the VictoriaMetrics provider configuration
+        config: "{{ providers.vm }}"
+        type: victoriametrics
+        with:
+          # Query that returns the sum of CPU usage for each job
+          # Example response:
+          # [
+          #   {'metric': {'job': 'victoriametrics'}, 'value': [1737808021, '0.022633333333333307']},
+          #   {'metric': {'job': 'vmagent'}, 'value': [1737808021, '0.009299999999999998']}
+          # ]
+          query: sum(rate(process_cpu_seconds_total)) by (job)
+          queryType: query
+
+  actions:
+    # Create an alert in Keep based on the query results
+    - name: create-alert
+      provider:
+        type: keep
+        with:
+          # Only create alert if CPU usage is above threshold
+          if: "{{ value.1 }} > 0.01 "
+          # Condition must hold for 1 minute before the alert moves from pending to firing
+          for: 1m
+          # Use job label to create unique fingerprint for each alert
+          fingerprint_fields:
+            - labels.job
+          alert:
+            # Alert name includes the specific job
+            name: "High CPU Usage on {{ metric.job }}"
+            description: "CPU usage is high on the VM (created from VM metric)"
+            # Set severity based on CPU usage thresholds:
+            # > 0.9 = critical
+            # > 0.7 = warning
+            # else = info
+            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
+            labels:
+              # Job label is required for alert fingerprinting
+              job: "{{ metric.job }}"
+              # Additional context labels
+              environment: production
+              app: myapp
+              service: api
+              team: devops
+              owner: alice
+
+```
diff --git a/docs/alertevaluation/examples/victoriametricssingle.mdx b/docs/alertevaluation/examples/victoriametricssingle.mdx
@@ -0,0 +1,53 @@
+---
+title: "VictoriaMetrics Single Alert Example"
+---
+
+This example demonstrates a simple CPU usage alert based on a metric:
+
+```yaml
+# This workflow queries VictoriaMetrics metrics and creates alerts based on CPU usage
+workflow:
+  # Unique identifier for this workflow
+  id: query-victoriametrics
+  # Display name shown in the UI
+  name: victoriametrics-alert-example
+  # Brief description of what this workflow does
+  description: Monitors CPU usage metrics from VictoriaMetrics and creates alerts when thresholds are exceeded
+
+  # Define how the workflow is triggered
+  triggers:
+    - type: manual # Can be triggered manually from the UI
+
+  # Steps to execute in order
+  steps:
+    - name: victoriametrics-step
+      provider:
+        # Use VictoriaMetrics provider config defined in providers.vm
+        config: "{{ providers.vm }}"
+        type: victoriametrics
+        with:
+          # Query average CPU usage rate
+          query: avg(rate(process_cpu_seconds_total))
+          queryType: query
+
+  # Actions to take based on the query results
+  actions:
+    - name: create-alert
+      provider:
+        type: keep
+        with:
+          # Create alert if CPU usage exceeds threshold
+          if: "{{ value.1 }} > 0.0040"
+          alert:
+            name: "High CPU Usage"
+            description: "[Single] CPU usage is high on the VM (created from VM metric)"
+            # Set severity based on CPU usage thresholds
+            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
+            # Alert labels for filtering and routing
+            labels:
+              environment: production
+              app: myapp
+              service: api
+              team: devops
+              owner: alice
+```
diff --git a/docs/alertevaluation/overview.mdx b/docs/alertevaluation/overview.mdx
@@ -0,0 +1,52 @@
+---
+title: "Overview"
+---
+
+The Keep Alert Evaluation Engine is a flexible system that enables you to create alerts based on any data source and define evaluation rules. Unlike traditional monitoring solutions that are tied to specific metrics, Keep's engine allows you to combine data from multiple sources and apply complex logic to determine when and how alerts should be triggered.
+
+## Core Features
+
+### Generic Data Source Support
+- Query any data source (databases, APIs, metrics systems)
+- Combine multiple data sources in a single alert rule
+- Apply custom transformations to the data
+
+### Flexible Alert Evaluation
+- Define custom conditions using templated expressions
+- Support for complex boolean logic and mathematical operations
+- State management for alert transitions (pending->firing->resolved)
+- Deduplication and alert instance tracking
+
+### Customizable Alert Definition
+- Full control over alert metadata (name, description, severity)
+- Dynamic labels based on evaluation context
+- Template support for all alert fields
+- Custom fingerprinting for alert grouping
+
+## Core Components
+
+### Alert States
+- **Pending**: Initial state when the alert condition is met (relevant only if `for` is supplied)
+- **Firing**: Active alert that has met its duration condition
+- **Resolved**: Alert that is no longer active
+
+### Alert Rule Components
+1. **Data Collection**: Query steps to gather data from any source
+2. **Condition (`if`)**: Expression that determines when to create/update an alert
+3. **Duration (`for`)**: Optional time period the condition must be true before firing
+4. **Alert Definition**: Complete control over how the alert looks and behaves:
+   - Name and description
+   - Severity levels
+   - Labels for routing
+   - Custom fields and annotations
+
+### State Management
+- **Fingerprinting**: Unique identifier for alert deduplication and state tracking
+- **Keep-Firing**: Control how long alerts remain active
+- **State Transitions**: Rules for how alerts move between states
+
+## Examples
+The following examples demonstrate different ways to use the alert evaluation engine:
+
+- [Single Metric Alert](/alertevaluation/examples/victoriametricssingle) - Basic example showing metrics-based alerting
+- [Multiple Metrics Alert](/alertevaluation/examples/victoriametricsmulti) - Advanced example with multiple alert instances
diff --git a/docs/mint.json b/docs/mint.json
@@ -103,6 +103,19 @@
         }
       ]
     },
+    {
+      "group": "Alert Evaluation Engine",
+      "pages": [
+        "alertevaluation/overview",
+        {
+          "group": "Examples",
+          "pages": [
+            "alertevaluation/examples/victoriametricssingle",
+            "alertevaluation/examples/victoriametricsmulti"
+          ]
+        }
+      ]
+    },
     {
       "group": "Providers",
       "pages": [

diff --git a/docs/overview/introduction.mdx b/docs/overview/introduction.mdx
@@ -41,6 +41,6 @@ Our vision is to democratize AIOps, making it accessible and practical for teams
 
 ## What you should read next
 
-- [Key Concepts](#key-concepts): Understand the foundational ideas behind Keep.
-- [Use Cases](#use-cases): Learn how Keep can solve specific IT operations challenges.
-- [Getting Started](#getting-started): Dive in and start using Keep today.
+- [Key Concepts](/overview/glossary): Understand the foundational ideas behind Keep.
+- [Use Cases](/overview/usecases): Learn how Keep can solve specific IT operations challenges.
+- [Playground](/overview/playground): Explore Keep's playground.
diff --git a/examples/workflows/create_alert_from_vm_metric.yml b/examples/workflows/create_alert_from_vm_metric.yml
@@ -1,39 +1,45 @@
+# This workflow queries VictoriaMetrics metrics and creates alerts based on CPU usage
 workflow:
+  # Unique identifier for this workflow
   id: query-victoriametrics
+  # Display name shown in the UI
   name: victoriametrics-alert-example
-  description: victoriametrics
+  # Brief description of what this workflow does
+  description: Monitors CPU usage metrics from VictoriaMetrics and creates alerts when thresholds are exceeded
+
+  # Define how the workflow is triggered
   triggers:
-    - type: manual
+    - type: manual # Can be triggered manually from the UI
+
+  # Steps to execute in order
   steps:
     - name: victoriametrics-step
       provider:
+        # Use VictoriaMetrics provider config defined in providers.vm
         config: "{{ providers.vm }}"
         type: victoriametrics
         with:
+          # Query average CPU usage rate
           query: avg(rate(process_cpu_seconds_total))
           queryType: query
 
+  # Actions to take based on the query results
   actions:
     - name: create-alert
-      # only create an alert if the CPU usage is greater than 0.005
-      if: "{{ steps.victoriametrics-step.results.data.result.0.value.1 }} > 0.001 "
       provider:
         type: keep
-        # create an alert with the following details
         with:
-          name: "High CPU Usage"
-          description: "CPU usage is high on the VM (created from VM metric)"
-          severity: '{{ steps.victoriametrics-step.results.data.result.0.value.1 }} > 0.9 ? "critical" : {{ steps.victoriametrics-step.results.data.result.0.value.1 }} > 0.7 ? "warning" : "info"'
-          labels:
-            environment: production
-            app: myapp
-            service: api
-            team: devops
-            owner: alice
-          # optional: customize the fingerprint based on these fields
-          fingerprint_fields:
-            - environment
-            - app
-            - service
-            - team
-            - owner
+          # Create alert if CPU usage exceeds threshold
+          if: "{{ value.1 }} > 0.0040"
+          alert:
+            name: "High CPU Usage"
+            description: "[Single] CPU usage is high on the VM (created from VM metric)"
+            # Set severity based on CPU usage thresholds
+            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
+            # Alert labels for filtering and routing
+            labels:
+              environment: production
+              app: myapp
+              service: api
+              team: devops
+              owner: alice
diff --git a/examples/workflows/create_alert_in_keep.yml b/examples/workflows/create_alert_in_keep.yml
@@ -2,12 +2,13 @@ workflow:
   id: create-alert
   description: Just creating one more alert in Keep
   triggers:
-  - type: manual
+    - type: manual
 
   actions:
     - name: create-alert
       provider:
         type: keep
         with:
-          name: "Alert created from the workflow"
-          description: "This alert was created from the create_alert_in_keep.yml example workflow."
+          alert:
+            name: "Alert created from the workflow"
+            description: "This alert was created from the create_alert_in_keep.yml example workflow."
diff --git a/examples/workflows/create_multi_alert_from_vm_metric.yml b/examples/workflows/create_multi_alert_from_vm_metric.yml
@@ -0,0 +1,58 @@
+workflow:
+  # Unique identifier for this workflow
+  id: query-victoriametrics-multi
+  # Display name shown in the UI
+  name: victoriametrics-multi-alert-example
+  # Brief description of what this workflow does
+  description: victoriametrics
+  triggers:
+    # This workflow can be triggered manually from the UI
+    - type: manual
+  steps:
+    # Query VictoriaMetrics for CPU metrics
+    - name: victoriametrics-step
+      provider:
+        # Use the VictoriaMetrics provider configuration
+        config: "{{ providers.vm }}"
+        type: victoriametrics
+        with:
+          # Query that returns the sum of CPU usage for each job
+          # Example response:
+          # [
+          #   {'metric': {'job': 'victoriametrics'}, 'value': [1737808021, '0.022633333333333307']},
+          #   {'metric': {'job': 'vmagent'}, 'value': [1737808021, '0.009299999999999998']}
+          # ]
+          query: sum(rate(process_cpu_seconds_total)) by (job)
+          queryType: query
+
+  actions:
+    # Create an alert in Keep based on the query results
+    - name: create-alert
+      provider:
+        type: keep
+        with:
+          # Only create alert if CPU usage is above threshold
+          if: "{{ value.1 }} > 0.01 "
+          # Condition must hold for 1 minute before the alert moves from pending to firing
+          for: 1m
+          # Use job label to create unique fingerprint for each alert
+          fingerprint_fields:
+            - labels.job
+          alert:
+            # Alert name includes the specific job
+            name: "High CPU Usage on {{ metric.job }}"
+            description: "CPU usage is high on the VM (created from VM metric)"
+            # Set severity based on CPU usage thresholds:
+            # > 0.9 = critical
+            # > 0.7 = warning
+            # else = info
+            severity: '{{ value.1 }} > 0.9 ? "critical" : {{ value.1 }} > 0.7 ? "warning" : "info"'
+            labels:
+              # Job label is required for alert fingerprinting
+              job: "{{ metric.job }}"
+              # Additional context labels
+              environment: production
+              app: myapp
+              service: api
+              team: devops
+              owner: alice
diff --git a/examples/workflows/update_workflows_from_s3.yml b/examples/workflows/update_workflows_from_s3.yml
@@ -6,7 +6,7 @@ triggers:
 steps:
   - name: s3-dump
     provider:
-      config: '{{ providers.s3 }}'
+      config: "{{ providers.s3 }}"
       type: s3
       with:
         bucket: "keep-workflows"

diff --git a/keep/contextmanager/contextmanager.py b/keep/contextmanager/contextmanager.py
@@ -100,8 +100,9 @@ def api_key(self):
             session.close()
         return self._api_key
 
-    def set_execution_context(self, workflow_execution_id):
+    def set_execution_context(self, workflow_id, workflow_execution_id):
         self.workflow_execution_id = workflow_execution_id
+        self.workflow_id = workflow_id
         self.logger_adapter.workflow_execution_id = workflow_execution_id
         for logger in self.__loggers.values():
             logger.workflow_execution_id = workflow_execution_id