Merge pull request #1 from kabisa/renovate/configure

Configure Renovate
kabisa · Sep 26, 2022 · 36e13e9 · 36e13e9
2 parents cbd0dbc + 2aaba56
commit 36e13e9
Show file tree

Hide file tree

Showing 14 changed files with 829 additions and 5 deletions.
diff --git a/.github/workflows/documentation.yaml b/.github/workflows/documentation.yaml
@@ -0,0 +1,29 @@
+name: Generate terraform docs
+
+on:
+  push:
+    # don't run when we push a tag
+    tags-ignore:
+    - '*'
+    # don't run when we merge to main
+    # the action should have run already
+    branches-ignore:
+    - 'main'
+jobs:
+  pre-commit:
+    runs-on: ubuntu-latest
+    steps:
+    - uses: terraform-linters/setup-tflint@v2
+      name: Setup TFLint
+      with:
+        tflint_version: v0.38.1
+    - uses: actions/checkout@v3
+    - uses: actions/setup-python@v4
+    # NOTE(review): the action ref below was restored from an e-mail-obfuscated
+    # scrape ("[email protected]"); v3.0.0 is assumed - confirm against the repository.
+    # First pass: allowed to fail, see the comment on continue-on-error.
+    - uses: pre-commit/action@v3.0.0
+      # pre-commit fails if it changed files
+      # we want to go on
+      continue-on-error: true
+    # Second pass: no continue-on-error, so a failure here fails the job.
+    - uses: pre-commit/action@v3.0.0
+    - uses: EndBug/add-and-commit@v9
+      with:
+        default_author: github_actions
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -1,8 +1,13 @@
 repos:
   - repo: https://github.com/gruntwork-io/pre-commit
-    rev: v0.1.14
+    rev: v0.1.12  # NOTE(review): lower than the v0.1.14 removed above - confirm the downgrade is intended
     hooks:
       - id: terraform-fmt
       - id: terraform-validate
       - id: tflint
-      - id: shellcheck
+  - repo: https://github.com/kabisa/terraform-datadog-pre-commit-hook
+    rev: "1.3.6"
+    hooks:
+      - id: terraform-datadog-docs
+        args:
+        - "."  # presumably the directory to document (repository root) - TODO confirm
diff --git a/.terraform.lock.hcl b/.terraform.lock.hcl
diff --git a/README.md b/README.md
diff --git a/errors-slo-variables.tf b/errors-slo-variables.tf
@@ -0,0 +1,116 @@
+variable "error_slo_enabled" { # switch for the error SLO: errors-slo.tf sets the SLO resource count to 1 when true, 0 when false
+  type    = bool
+  default = true # also ANDed with error_slo_burn_rate_enabled to gate the burn-rate monitor
+}
+
+variable "error_slo_note" { # NOTE(review): not referenced in errors-slo.tf as shown in this diff - confirm where it is consumed
+  type    = string
+  default = ""
+}
+
+variable "error_slo_docs" { # NOTE(review): not referenced in errors-slo.tf as shown in this diff - confirm where it is consumed
+  type    = string
+  default = ""
+}
+
+variable "error_slo_filter_override" { # when non-empty, replaces local.filter_str as the filter inside the generated SLO queries
+  type    = string
+  default = "" # empty string makes coalesce() in errors-slo.tf fall through to local.filter_str
+}
+
+variable "error_slo_warning" { # passed to thresholds.warning of the error SLO resource
+  type    = number
+  default = null
+}
+
+variable "error_slo_critical" { # passed to thresholds.target of the error SLO resource
+  type    = number
+  default = 99.9 # presumably a percentage - TODO confirm
+}
+
+variable "error_slo_alerting_enabled" { # NOTE(review): not referenced in errors-slo.tf; the monitor there uses error_slo_burn_rate_alerting_enabled - confirm usage
+  type    = bool
+  default = true
+}
+
+variable "error_slo_error_filter" { # appended verbatim after the base filter inside {...} of the generated numerator query
+  type        = string
+  description = "Filter string to select the non-errors for the SLO, Dont forget to include the comma or (AND or OR) keywords"
+  default     = ",!status:error" # leading comma joins it onto the preceding filter
+}
+
+variable "error_slo_timeframe" {
+  # Window of the error SLO; errors-slo.tf passes it to thresholds.timeframe.
+  type    = string
+  default = "30d"
+
+  # Only the three listed windows are accepted.
+  validation {
+    error_message = "SLO Timeframe can  be 7,30,90 days. Example: 7d."
+    condition     = contains(["7d", "30d", "90d"], var.error_slo_timeframe)
+  }
+}
+
+variable "error_slo_numerator_override" { # when non-empty, used as the SLO numerator instead of the generated requests.count query
+  type    = string
+  default = ""
+}
+
+variable "error_slo_denominator_override" { # when non-empty, used as the SLO denominator instead of the generated requests.count query
+  type    = string
+  default = ""
+}
+
+variable "error_slo_burn_rate_notification_channel_override" { # when non-empty, takes precedence over var.notification_channel for the burn-rate monitor
+  type    = string
+  default = "" # if var.notification_channel is empty too, errors-slo.tf resolves the channel to ""
+}
+
+variable "error_slo_burn_rate_enabled" { # the burn-rate monitor is enabled only when this and error_slo_enabled are both true
+  type    = bool
+  default = true
+}
+
+variable "error_slo_burn_rate_alerting_enabled" { # forwarded as alerting_enabled to the burn-rate monitor module
+  type    = bool
+  default = true
+}
+
+variable "error_slo_burn_rate_priority" {
+  # Forwarded as `priority` to the burn-rate monitor module in errors-slo.tf.
+  type        = number
+  default     = 3
+  description = "Number from 1 (high) to 5 (low)."
+}
+
+variable "error_slo_burn_rate_warning" { # forwarded as warning_threshold to the burn-rate monitor module
+  type    = number
+  default = null
+}
+
+variable "error_slo_burn_rate_critical" { # used both as the `>` threshold in the burn_rate query and as critical_threshold of the monitor
+  type    = number
+  default = 10 # 10x burn rate
+}
+
+variable "error_slo_burn_rate_note" { # forwarded as `note` to the burn-rate monitor module
+  type    = string
+  default = ""
+}
+
+variable "error_slo_burn_rate_docs" { # forwarded as `docs` to the burn-rate monitor module; the default is a general explanation of burn-rate alerts
+  type    = string
+  default = "Use burn rates alerts to measure how fast your error budget is being depleted relative to the time window of your SLO. For example, for a 30 day SLO if a burn rate of 1 is sustained, that means the error budget will be fully depleted in exactly 30 days, a burn rate of 2 means in exactly 15 days, etc. Therefore, you could use a burn rate alert to notify you if a burn rate of 10 is measured in the past hour. Burn rate alerts evaluate two time windows: a long window which you specify and a short window that is automatically calculated as 1/12 of your long window. The long window's purpose is to reduce alert flappiness, while the short window's purpose is to improve recovery time. If your threshold is violated in both windows, you will receive an alert."
+}
+
+variable "error_slo_burn_rate_evaluation_period" { # inserted into the monitor query as .over("<value>")
+  type    = string
+  default = "30d" # NOTE(review): equals the error_slo_timeframe default but is not derived from it - confirm the two must be kept in sync
+}
+
+variable "error_slo_burn_rate_short_window" { # inserted into the monitor query as .short_window("<value>")
+  type    = string
+  default = "5m"
+}
+
+variable "error_slo_burn_rate_long_window" { # inserted into the monitor query as .long_window("<value>")
+  type    = string
+  default = "1h"
+}
diff --git a/errors-slo.tf b/errors-slo.tf
@@ -0,0 +1,72 @@
+locals {
+  # Effective switch for the burn-rate monitor: the SLO itself must be enabled too.
+  error_slo_burn_rate_enabled = var.error_slo_enabled && var.error_slo_burn_rate_enabled
+
+  # ID of the SLO resource, or "" when the burn-rate monitor is switched off.
+  error_slo_id = local.error_slo_burn_rate_enabled ? datadog_service_level_objective.error_slo[0].id : ""
+
+  # Filter used in the generated queries: a non-empty override wins over local.filter_str.
+  error_slo_filter = coalesce(var.error_slo_filter_override, local.filter_str)
+
+  # SLO queries: a non-empty override is used as-is, otherwise a requests.count
+  # query is generated. The numerator additionally appends var.error_slo_error_filter.
+  error_slo_numerator   = coalesce(var.error_slo_numerator_override, "sum:${var.slo_metric_prefix}requests.count{${local.error_slo_filter}${var.error_slo_error_filter}}.as_count()")
+  error_slo_denominator = coalesce(var.error_slo_denominator_override, "sum:${var.slo_metric_prefix}requests.count{${local.error_slo_filter}}.as_count()")
+
+  # Channel for the burn-rate monitor; try() yields "" when coalesce() fails
+  # because neither the override nor var.notification_channel is set.
+  error_slo_burn_rate_notification_channel = try(coalesce(var.error_slo_burn_rate_notification_channel_override, var.notification_channel), "")
+}
+
+# Metric-based error SLO; created only when var.error_slo_enabled is true.
+resource "datadog_service_level_objective" "error_slo" {
+  count = var.error_slo_enabled ? 1 : 0
+
+  type        = "metric"
+  name        = "${local.service_display_name} - ${var.log_source_name} - Error SLO"
+  description = "Errors SLO for ${local.service_display_name}"
+  tags        = local.normalized_tags
+
+  # Queries are resolved in the locals block (generated or overridden).
+  query {
+    denominator = local.error_slo_denominator
+    numerator   = local.error_slo_numerator
+  }
+
+  thresholds {
+    target    = var.error_slo_critical
+    warning   = var.error_slo_warning
+    timeframe = var.error_slo_timeframe
+  }
+}
+
+module "error_slo_burn_rate" {
+  # Burn-rate monitor for the error SLO defined in this file, built with the
+  # generic-monitor module.
+  source  = "kabisa/generic-monitor/datadog"
+  version = "1.0.0"
+
+  name  = "${var.log_source_name} - Error SLO - Burn Rate"
+  query = "burn_rate(\"${local.error_slo_id}\").over(\"${var.error_slo_burn_rate_evaluation_period}\").long_window(\"${var.error_slo_burn_rate_long_window}\").short_window(\"${var.error_slo_burn_rate_short_window}\") > ${var.error_slo_burn_rate_critical}"
+
+  # local.error_slo_id is "" when the monitor is disabled, see `enabled` below.
+  alert_message    = "${local.service_display_name} service is burning through its Error Budget. The percentage of 5XX status codes is {{threshold}}x higher than expected"
+  recovery_message = "${local.service_display_name} service burn rate has recovered"
+  type             = "slo alert"
+
+  # monitor level vars
+  # Reuse the local that also guards error_slo_id, so the two cannot drift apart.
+  enabled            = local.error_slo_burn_rate_enabled
+  alerting_enabled   = var.error_slo_burn_rate_alerting_enabled
+  warning_threshold  = var.error_slo_burn_rate_warning
+  critical_threshold = var.error_slo_burn_rate_critical
+  priority           = var.error_slo_burn_rate_priority
+  docs               = var.error_slo_burn_rate_docs
+  note               = var.error_slo_burn_rate_note
+
+  # module level vars
+  env                  = var.env
+  service              = var.service
+  service_display_name = var.service_display_name
+  notification_channel = local.error_slo_burn_rate_notification_channel
+  additional_tags      = var.additional_tags
+  locked               = var.locked
+  name_prefix          = var.name_prefix
+  name_suffix          = var.name_suffix
+}
diff --git a/latency-slo-variables.tf b/latency-slo-variables.tf
@@ -0,0 +1,116 @@
+variable "latency_slo_enabled" { # on/off switch; its consumer is outside this excerpt - presumably gates the latency SLO, TODO confirm
+  type        = bool
+  default     = true
+  description = "Note that this monitor requires custom metrics to be present. Those can unfortunately not be created with Terraform yet"
+}
+
+variable "latency_slo_note" { # free-text string input; consumer not visible in this excerpt - TODO confirm usage
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_docs" { # free-text string input; consumer not visible in this excerpt - TODO confirm usage
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_filter_override" { # presumably overrides the default filter like error_slo_filter_override does - TODO confirm
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_warning" { # presumably the warning threshold of the latency SLO - TODO confirm
+  type    = number
+  default = null
+}
+
+variable "latency_slo_critical" { # presumably the target of the latency SLO - TODO confirm
+  type    = number
+  default = 99.9
+}
+
+variable "latency_slo_latency_bucket" { # declares no default, so callers must set it
+  description = "SLO latency bucket in ms for your logs"
+  type        = number
+}
+
+variable "latency_slo_alerting_enabled" { # bool toggle; consumer not visible in this excerpt - TODO confirm usage
+  type    = bool
+  default = true
+}
+
+variable "latency_slo_timeframe" {
+  # Window of the latency SLO; presumably used as its thresholds timeframe - TODO confirm.
+  type    = string
+  default = "30d"
+
+  # Only the three listed windows are accepted.
+  validation {
+    error_message = "SLO Timeframe can  be 7,30,90 days. Example: 7d."
+    condition     = contains(["7d", "30d", "90d"], var.latency_slo_timeframe)
+  }
+}
+
+variable "latency_slo_burn_rate_priority" {
+  # Presumably forwarded as the priority of the latency burn-rate monitor - TODO confirm.
+  type        = number
+  default     = 3
+  description = "Number from 1 (high) to 5 (low)."
+}
+
+variable "latency_slo_burn_rate_warning" { # presumably the warning threshold of the latency burn-rate monitor - TODO confirm
+  type    = number
+  default = null
+}
+
+variable "latency_slo_burn_rate_critical" { # presumably the critical threshold of the latency burn-rate monitor - TODO confirm
+  type    = number
+  default = 10 # 10x burn rate
+}
+
+variable "latency_slo_burn_rate_note" { # presumably the note of the latency burn-rate monitor - TODO confirm
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_burn_rate_docs" { # same default text as error_slo_burn_rate_docs; presumably the docs of the latency burn-rate monitor - TODO confirm
+  type    = string
+  default = "Use burn rates alerts to measure how fast your error budget is being depleted relative to the time window of your SLO. For example, for a 30 day SLO if a burn rate of 1 is sustained, that means the error budget will be fully depleted in exactly 30 days, a burn rate of 2 means in exactly 15 days, etc. Therefore, you could use a burn rate alert to notify you if a burn rate of 10 is measured in the past hour. Burn rate alerts evaluate two time windows: a long window which you specify and a short window that is automatically calculated as 1/12 of your long window. The long window's purpose is to reduce alert flappiness, while the short window's purpose is to improve recovery time. If your threshold is violated in both windows, you will receive an alert."
+}
+
+variable "latency_slo_burn_rate_evaluation_period" { # presumably the .over() period of the latency burn-rate query - TODO confirm
+  type    = string
+  default = "30d" # NOTE(review): equals the latency_slo_timeframe default but is a separate input - confirm they must match
+}
+
+variable "latency_slo_burn_rate_short_window" { # presumably the short window of the latency burn-rate query - TODO confirm
+  type    = string
+  default = "5m"
+}
+
+variable "latency_slo_burn_rate_long_window" { # presumably the long window of the latency burn-rate query - TODO confirm
+  type    = string
+  default = "1h"
+}
+
+variable "latency_slo_burn_rate_notification_channel_override" { # presumably overrides the notification channel of the latency burn-rate monitor - TODO confirm
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_burn_rate_enabled" { # bool toggle; presumably enables the latency burn-rate monitor - TODO confirm
+  type    = bool
+  default = true
+}
+
+variable "latency_slo_burn_rate_alerting_enabled" { # bool toggle; presumably controls alerting of the latency burn-rate monitor - TODO confirm
+  type    = bool
+  default = true
+}
+
+variable "latency_slo_custom_numerator" { # NOTE(review): the error SLO names its equivalent input *_numerator_override - presumably the same role, confirm
+  type    = string
+  default = ""
+}
+
+variable "latency_slo_custom_denominator" { # NOTE(review): the error SLO names its equivalent input *_denominator_override - presumably the same role, confirm
+  type    = string
+  default = ""
+}