From c10b4ac4206dec54abf668418f33502a49b4eb2d Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sun, 5 Nov 2023 08:38:39 +0300 Subject: [PATCH 1/8] feat(alerting): add reminder-interval feature which allows setting an interval to run alert consecutively --- alerting/alert/alert.go | 4 ++++ core/endpoint.go | 3 +++ watchdog/alerting.go | 26 ++++++++++++++++++++------ 3 files changed, 27 insertions(+), 6 deletions(-) diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go index acd5afe85..9d0c39527 100644 --- a/alerting/alert/alert.go +++ b/alerting/alert/alert.go @@ -3,6 +3,7 @@ package alert import ( "errors" "strings" + "time" ) var ( @@ -45,6 +46,9 @@ type Alert struct { // ongoing/triggered incidents ResolveKey string `yaml:"-"` + // ReminderInterval is the interval between reminders + ReminderInterval time.Duration `yaml:"reminder-interval,omitempty"` + // Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value // should be set back to false. It is used to prevent the same alert from going out twice. // diff --git a/core/endpoint.go b/core/endpoint.go index 1276f483f..44f1e45e5 100644 --- a/core/endpoint.go +++ b/core/endpoint.go @@ -130,6 +130,9 @@ type Endpoint struct { // SSH is the configuration of SSH monitoring. SSH *SSH `yaml:"ssh,omitempty"` + + // LastReminderSent is the time at which the last reminder was sent for this endpoint. 
+ LastReminderSent time.Time `yaml:"-"` } type SSH struct { diff --git a/watchdog/alerting.go b/watchdog/alerting.go index 881edbab3..ecdae24f6 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -4,6 +4,7 @@ import ( "errors" "log" "os" + "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/core" @@ -25,20 +26,29 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin endpoint.NumberOfSuccessesInARow = 0 endpoint.NumberOfFailuresInARow++ for _, endpointAlert := range endpoint.Alerts { - // If the alert hasn't been triggered, move to the next one + // Check for initial alert trigger if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > endpoint.NumberOfFailuresInARow { continue } - if endpointAlert.Triggered { + // Determine if an initial alert should be sent + sendInitialAlert := !endpointAlert.Triggered + // Determine if a reminder should be sent + sendReminder := endpointAlert.Triggered && endpointAlert.ReminderInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.ReminderInterval + // If neither initial alert nor reminder needs to be sent, skip to the next alert + if !sendInitialAlert && !sendReminder { if debug { - log.Printf("[watchdog][handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", endpoint.Name, endpointAlert.GetDescription()) + log.Printf("[watchdog][handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", endpoint.Name, endpointAlert.GetDescription()) } continue } alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type) if alertProvider != nil { - log.Printf("[watchdog][handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription()) var err error + alertType := "reminder" + if sendInitialAlert { + alertType = 
"initial" + } + log.Printf("[watchdog][handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription()) if os.Getenv("MOCK_ALERT_PROVIDER") == "true" { if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" { err = errors.New("error") @@ -49,10 +59,14 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin if err != nil { log.Printf("[watchdog][handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", endpoint.Name, err.Error()) } else { - endpointAlert.Triggered = true + // Mark initial alert as triggered and update last reminder time + if sendInitialAlert { + endpointAlert.Triggered = true + } + endpoint.LastReminderSent = time.Now() } } else { - log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type) + log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being due, because the provider wasn't configured properly", endpointAlert.Type) } } } From 28a3ac72a29c728847ec93d271c4669fefcc3a3d Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sun, 5 Nov 2023 08:38:54 +0300 Subject: [PATCH 2/8] feat(test): add tests for reminder-interval feature --- watchdog/alerting_test.go | 42 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go index 88dd10a14..3fbfe7faa 100644 --- a/watchdog/alerting_test.go +++ b/watchdog/alerting_test.go @@ -3,6 +3,7 @@ package watchdog import ( "os" "testing" + "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/alerting/alert" @@ -475,6 +476,47 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) { verify(t, endpoint, 0, 2, false, "") } +func TestHandleAlertingWithReminderInterval(t *testing.T) { + _ = 
os.Setenv("MOCK_ALERT_PROVIDER", "true") + defer os.Clearenv() + + cfg := &config.Config{ + Debug: true, + Alerting: &alerting.Config{ + Custom: &custom.AlertProvider{ + URL: "https://twin.sh/health", + Method: "GET", + }, + }, + } + enabled := true + endpoint := &core.Endpoint{ + URL: "https://example.com", + Alerts: []*alert.Alert{ + { + Type: alert.TypeCustom, + Enabled: &enabled, + FailureThreshold: 2, + SuccessThreshold: 3, + SendOnResolved: &enabled, + Triggered: false, + ReminderInterval: 1 * time.Second, + }, + }, + } + + verify(t, endpoint, 0, 0, false, "The alert shouldn't start triggered") + HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug) + verify(t, endpoint, 1, 0, false, "The alert shouldn't have triggered") + HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug) + verify(t, endpoint, 2, 0, true, "The alert should've triggered") + HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug) + verify(t, endpoint, 3, 0, true, "The alert should still be triggered") + HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug) + verify(t, endpoint, 4, 0, true, "The alert should still be triggered") + HandleAlerting(endpoint, &core.Result{Success: true}, cfg.Alerting, cfg.Debug) +} + func verify(t *testing.T, endpoint *core.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) { if endpoint.NumberOfFailuresInARow != expectedNumberOfFailuresInARow { t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, endpoint.NumberOfFailuresInARow) From c3e9edbd5a585a108545bafc8c7806178faf0a95 Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sun, 5 Nov 2023 08:39:09 +0300 Subject: [PATCH 3/8] feat(docs): modify documentation for reminder-interval feature --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 
231183d0a..d1db88b48 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,7 @@ If you want to test it locally, see [Docker](#docker). | `endpoints[].alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` | | `endpoints[].alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` | | `endpoints[].alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` | +| `endpoints[].alerts[].reminder-interval` | Configuration for setting an interval between reminders. | `""` | | `endpoints[].client` | [Client configuration](#client-configuration). | `{}` | | `endpoints[].ui` | UI configuration at the endpoint level. | `{}` | | `endpoints[].ui.hide-hostname` | Whether to hide the hostname in the result. | `false` | From cd46503146cd47008e0f17d7219c6a8171c004db Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sat, 27 Apr 2024 15:15:51 +0300 Subject: [PATCH 4/8] chore: change "due" to "TRIGGERED" for easier log look through --- watchdog/alerting.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/watchdog/alerting.go b/watchdog/alerting.go index ecdae24f6..d500f5ba9 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -66,7 +66,7 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin endpoint.LastReminderSent = time.Now() } } else { - log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being due, because the provider wasn't configured properly", endpointAlert.Type) + log.Printf("[watchdog][handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type) } } } From ab0610f3017463c5f903b20c2f56a04ab24304fa Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sat, 27 Apr 2024 15:17:37 +0300 Subject: [PATCH 5/8] chore: update "reminder-interval" to "repeat-interval" 
--- alerting/alert/alert.go | 4 ++-- watchdog/alerting.go | 2 +- watchdog/alerting_test.go | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/alerting/alert/alert.go b/alerting/alert/alert.go index 9d0c39527..7929321f7 100644 --- a/alerting/alert/alert.go +++ b/alerting/alert/alert.go @@ -46,8 +46,8 @@ type Alert struct { // ongoing/triggered incidents ResolveKey string `yaml:"-"` - // ReminderInterval is the interval between reminders - ReminderInterval time.Duration `yaml:"reminder-interval,omitempty"` + // RepeatInterval is the interval between reminders + RepeatInterval time.Duration `yaml:"repeat-interval,omitempty"` // Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value // should be set back to false. It is used to prevent the same alert from going out twice. diff --git a/watchdog/alerting.go b/watchdog/alerting.go index d500f5ba9..f9f3b8328 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -33,7 +33,7 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin // Determine if an initial alert should be sent sendInitialAlert := !endpointAlert.Triggered // Determine if a reminder should be sent - sendReminder := endpointAlert.Triggered && endpointAlert.ReminderInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.ReminderInterval + sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.RepeatInterval // If neither initial alert nor reminder needs to be sent, skip to the next alert if !sendInitialAlert && !sendReminder { if debug { diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go index 3fbfe7faa..04c45103b 100644 --- a/watchdog/alerting_test.go +++ b/watchdog/alerting_test.go @@ -500,7 +500,7 @@ func TestHandleAlertingWithReminderInterval(t *testing.T) { SuccessThreshold: 3, SendOnResolved: &enabled, Triggered: false, - ReminderInterval: 1 * 
time.Second, + RepeatInterval: 1 * time.Second, }, }, } From 4c3ab7cd8120e5e85359a8aacb16d55605264e41 Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sat, 27 Apr 2024 15:49:48 +0300 Subject: [PATCH 6/8] chore: update reminder-interval to repeat-interval --- README.md | 19 ++++++++++--------- watchdog/alerting_test.go | 2 +- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index c7c8a7d57..47fb9e950 100644 --- a/README.md +++ b/README.md @@ -520,15 +520,16 @@ individual endpoints with configurable descriptions and thresholds. Alerts are configured at the endpoint level like so: -| Parameter | Description | Default | -|:-----------------------------|:-------------------------------------------------------------------------------|:--------------| -| `alerts` | List of all alerts for a given endpoint. | `[]` | -| `alerts[].type` | Type of alert.
See table below for all valid types. | Required `""` | -| `alerts[].enabled` | Whether to enable the alert. | `true` | -| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` | -| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` | -| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` | -| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` | +| Parameter | Description | Default | +|:----------------------------------|:-------------------------------------------------------------------------------|:--------------| +| `alerts` | List of all alerts for a given endpoint. | `[]` | +| `alerts[].type` | Type of alert.
See table below for all valid types. | Required `""` | +| `alerts[].enabled` | Whether to enable the alert. | `true` | +| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` | +| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` | +| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` | +| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` | +| `alerts[].repeat-interval` | Interval between reminder notifications while the alert remains triggered. Reminders are disabled if unset. | `""` | Here's an example of what an alert configuration might look like at the endpoint level: ```yaml diff --git a/watchdog/alerting_test.go b/watchdog/alerting_test.go index 60a6445bb..739f3e9af 100644 --- a/watchdog/alerting_test.go +++ b/watchdog/alerting_test.go @@ -488,7 +488,7 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) { verify(t, endpoint, 0, 2, false, "") } -func TestHandleAlertingWithReminderInterval(t *testing.T) { +func TestHandleAlertingWithRepeatInterval(t *testing.T) { _ = os.Setenv("MOCK_ALERT_PROVIDER", "true") defer os.Clearenv() From bd36446f959013c2e4a8763c85f97df6b9833d67 Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sat, 27 Apr 2024 15:50:22 +0300 Subject: [PATCH 7/8] chore: adapt repeat interval feature after merge --- watchdog/alerting.go | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/watchdog/alerting.go b/watchdog/alerting.go index 29898016b..d8d3c0486 100644 --- a/watchdog/alerting.go +++ b/watchdog/alerting.go @@ -4,6 +4,7 @@ import ( "errors" "log" "os" + "time" "github.com/TwiN/gatus/v5/alerting" "github.com/TwiN/gatus/v5/core" @@ -29,16 +30,25 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold >
endpoint.NumberOfFailuresInARow { continue } - if endpointAlert.Triggered { + // Determine if an initial alert should be sent + sendInitialAlert := !endpointAlert.Triggered + // Determine if a reminder should be sent + sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.RepeatInterval + // If neither initial alert nor reminder needs to be sent, skip to the next alert + if !sendInitialAlert && !sendReminder { if debug { - log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", endpoint.Name, endpointAlert.GetDescription()) + log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", endpoint.Name, endpointAlert.GetDescription()) } continue } alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type) if alertProvider != nil { - log.Printf("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription()) var err error + alertType := "reminder" + if sendInitialAlert { + alertType = "initial" + } + log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription()) if os.Getenv("MOCK_ALERT_PROVIDER") == "true" { if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" { err = errors.New("error") @@ -49,7 +59,11 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertin if err != nil { log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", endpoint.Name, err.Error()) } else { - endpointAlert.Triggered = true + // Mark initial alert as triggered and update last reminder time + if 
sendInitialAlert { + endpointAlert.Triggered = true + } + endpoint.LastReminderSent = time.Now() } } else { log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type) From 485a5fb6392cb0dfd411de0f65dec7226355f6d5 Mon Sep 17 00:00:00 2001 From: Bugra Kocabay Date: Sat, 27 Apr 2024 15:54:25 +0300 Subject: [PATCH 8/8] chore: adapt repeat interval feature after merge --- core/endpoint.go | 3 +++ 1 file changed, 3 insertions(+) diff --git a/core/endpoint.go b/core/endpoint.go index 749b1d3d3..f027b0808 100644 --- a/core/endpoint.go +++ b/core/endpoint.go @@ -120,6 +120,9 @@ type Endpoint struct { // NumberOfSuccessesInARow is the number of successful evaluations in a row NumberOfSuccessesInARow int `yaml:"-"` + + // LastReminderSent is the time at which the last alert notification (initial or reminder) was successfully sent for this endpoint. + LastReminderSent time.Time `yaml:"-"` } // IsEnabled returns whether the endpoint is enabled or not