diff --git a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
index 6ccd8ae4..a813808d 100644
--- a/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
+++ b/terraform/infra-policyengine-api/modules/fastapi_cloudrun/monitoring.tf
@@ -117,3 +117,50 @@ resource "google_monitoring_alert_policy" "limit_alert" {
   }
 }
 
+# Alerts when Cloud Workflows reports executions finishing with status FAILED.
+# NOTE(review): the query does not filter on workflow_id, so it matches any workflow
+# in the project — confirm whether it should be scoped to the simulation workflow.
+resource "google_monitoring_alert_policy" "simulation_workflow_failure_alert" {
+  project      = var.project_id
+  display_name = "Simulation Workflow Failures"
+  combiner     = "OR"
+
+  conditions {
+    display_name = "Simulation Workflow Failed in Last 5 Minutes"
+    condition_prometheus_query_language {
+      # Threshold is "> 0" so that a single failed execution fires the alert, as the
+      # documentation text promises ("one or more"). PromQL increase() extrapolates
+      # and can return fractional values, so "> 1" missed isolated failures.
+      query = <<-EOT
+        increase(workflows_googleapis_com:finished_execution_count{monitored_resource="workflows.googleapis.com/Workflow",status="FAILED"}[5m]) > 0
+      EOT
+      # The 5-minute lookback already lives in the query window. A non-zero hold
+      # duration would require the condition to stay true for that long, which a
+      # single failure inside a 5m window cannot reliably do.
+      duration            = "0s"
+      evaluation_interval = "60s"
+      labels = {
+        severity = "critical"
+      }
+    }
+  }
+
+  notification_channels = local.notification_channels
+
+  documentation {
+    content   = <<-EOT
+      🚨 *Simulation Workflow Failure Alert*
+
+      One or more executions of the simulation workflow failed within the last 5 minutes.
+
+      *Steps:*
+      - Check Cloud Workflows logs: [View Details](https://console.cloud.google.com/workflows/workflow/${var.region}/simulation-workflow/metrics?project=${var.project_id})
+      - Confirm input data
+      - Review recent deploys: [Latest commit](${var.commit_url})
+    EOT
+    mime_type = "text/markdown"
+  }
+
+  enabled = true
+}
+