
Feat/reminder interval #614

Open
wants to merge 10 commits into master
19 changes: 10 additions & 9 deletions README.md
@@ -520,15 +520,16 @@ individual endpoints with configurable descriptions and thresholds.

Alerts are configured at the endpoint level like so:

| Parameter | Description | Default |
|:-----------------------------|:-------------------------------------------------------------------------------|:--------------|
| `alerts` | List of all alerts for a given endpoint. | `[]` |
| `alerts[].type` | Type of alert. <br />See table below for all valid types. | Required `""` |
| `alerts[].enabled` | Whether to enable the alert. | `true` |
| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
| Parameter | Description | Default |
|:----------------------------------|:-------------------------------------------------------------------------------|:--------------|
| `alerts` | List of all alerts for a given endpoint. | `[]` |
| `alerts[].type` | Type of alert. <br />See table below for all valid types. | Required `""` |
| `alerts[].enabled` | Whether to enable the alert. | `true` |
| `alerts[].failure-threshold` | Number of failures in a row needed before triggering the alert. | `3` |
| `alerts[].success-threshold` | Number of successes in a row before an ongoing incident is marked as resolved. | `2` |
| `alerts[].send-on-resolved` | Whether to send a notification once a triggered alert is marked as resolved. | `false` |
| `alerts[].description` | Description of the alert. Will be included in the alert sent. | `""` |
| `alerts[].repeat-interval`        | Interval between reminder notifications sent while the alert remains triggered. Leave empty to disable reminders. | `""`          |

Here's an example of what an alert configuration might look like at the endpoint level:
```yaml
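# Illustrative sketch only: the README's actual example is collapsed in this diff view.
# The endpoint name, URL, condition, and alert values below are assumptions based on the
# parameter table above; `repeat-interval: 10m` shows the new parameter added by this PR.
endpoints:
  - name: website
    url: "https://twin.sh/health"
    interval: 30s
    conditions:
      - "[STATUS] == 200"
    alerts:
      - type: slack
        enabled: true
        failure-threshold: 3
        success-threshold: 2
        send-on-resolved: true
        description: "healthcheck failed"
        repeat-interval: 10m
```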
4 changes: 4 additions & 0 deletions alerting/alert/alert.go
@@ -3,6 +3,7 @@ package alert
import (
"errors"
"strings"
"time"
)

var (
@@ -45,6 +46,9 @@ type Alert struct {
// ongoing/triggered incidents
ResolveKey string `yaml:"-"`

// RepeatInterval is the interval between reminders sent while the alert remains triggered; 0 (the default) disables reminders
RepeatInterval time.Duration `yaml:"repeat-interval,omitempty"`

// Triggered is used to determine whether an alert has been triggered. When an alert is resolved, this value
// should be set back to false. It is used to prevent the same alert from going out twice.
//
3 changes: 3 additions & 0 deletions core/endpoint.go
@@ -120,6 +120,9 @@ type Endpoint struct {

// NumberOfSuccessesInARow is the number of successful evaluations in a row
NumberOfSuccessesInARow int `yaml:"-"`

// LastReminderSent is the time at which the last alert or reminder was sent for this endpoint.
LastReminderSent time.Time `yaml:"-"`
}

// IsEnabled returns whether the endpoint is enabled or not
22 changes: 18 additions & 4 deletions watchdog/alerting.go
@@ -4,6 +4,7 @@ import (
"errors"
"log"
"os"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/core"
@@ -29,16 +30,25 @@ func handleAlertsToTrigger(endpoint *core.Endpoint, result *core.Result, alertingConfig *alerting.Config, debug bool) {
if !endpointAlert.IsEnabled() || endpointAlert.FailureThreshold > endpoint.NumberOfFailuresInARow {
continue
}
if endpointAlert.Triggered {
// Determine if an initial alert should be sent
sendInitialAlert := !endpointAlert.Triggered
// Determine if a reminder should be sent
sendReminder := endpointAlert.Triggered && endpointAlert.RepeatInterval > 0 && time.Since(endpoint.LastReminderSent) >= endpointAlert.RepeatInterval
// If neither initial alert nor reminder needs to be sent, skip to the next alert
if !sendInitialAlert && !sendReminder {
if debug {
log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' has already been TRIGGERED, skipping", endpoint.Name, endpointAlert.GetDescription())
log.Printf("[watchdog.handleAlertsToTrigger] Alert for endpoint=%s with description='%s' is not due for triggering or reminding, skipping", endpoint.Name, endpointAlert.GetDescription())
}
continue
}
alertProvider := alertingConfig.GetAlertingProviderByAlertType(endpointAlert.Type)
if alertProvider != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Sending %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription())
var err error
alertType := "reminder"
if sendInitialAlert {
alertType = "initial"
}
log.Printf("[watchdog.handleAlertsToTrigger] Sending %s %s alert because alert for endpoint=%s with description='%s' has been TRIGGERED", alertType, endpointAlert.Type, endpoint.Name, endpointAlert.GetDescription())
if os.Getenv("MOCK_ALERT_PROVIDER") == "true" {
if os.Getenv("MOCK_ALERT_PROVIDER_ERROR") == "true" {
err = errors.New("error")
@@ -49,7 +59,11 @@
if err != nil {
log.Printf("[watchdog.handleAlertsToTrigger] Failed to send an alert for endpoint=%s: %s", endpoint.Name, err.Error())
} else {
endpointAlert.Triggered = true
// Mark initial alert as triggered and update last reminder time
if sendInitialAlert {
endpointAlert.Triggered = true
}
endpoint.LastReminderSent = time.Now()
}
} else {
log.Printf("[watchdog.handleAlertsToResolve] Not sending alert of type=%s despite being TRIGGERED, because the provider wasn't configured properly", endpointAlert.Type)
42 changes: 42 additions & 0 deletions watchdog/alerting_test.go
@@ -3,6 +3,7 @@ package watchdog
import (
"os"
"testing"
"time"

"github.com/TwiN/gatus/v5/alerting"
"github.com/TwiN/gatus/v5/alerting/alert"
@@ -487,6 +488,47 @@ func TestHandleAlertingWithProviderThatOnlyReturnsErrorOnResolve(t *testing.T) {
verify(t, endpoint, 0, 2, false, "")
}

func TestHandleAlertingWithRepeatInterval(t *testing.T) {
_ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
defer os.Clearenv()

cfg := &config.Config{
Debug: true,
Alerting: &alerting.Config{
Custom: &custom.AlertProvider{
URL: "https://twin.sh/health",
Method: "GET",
},
},
}
enabled := true
endpoint := &core.Endpoint{
URL: "https://example.com",
Alerts: []*alert.Alert{
{
Type: alert.TypeCustom,
Enabled: &enabled,
FailureThreshold: 2,
SuccessThreshold: 3,
SendOnResolved: &enabled,
Triggered: false,
RepeatInterval: 1 * time.Second,
},
},
}

verify(t, endpoint, 0, 0, false, "The alert shouldn't start triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 1, 0, false, "The alert shouldn't have triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 2, 0, true, "The alert should've triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 3, 0, true, "The alert should still be triggered")
HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
verify(t, endpoint, 4, 0, true, "The alert should still be triggered")
HandleAlerting(endpoint, &core.Result{Success: true}, cfg.Alerting, cfg.Debug)
}
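
// The following is a hypothetical follow-up test, not part of this diff: a sketch of how
// the reminder behaviour itself could be exercised, assuming the fields and helpers shown
// above (HandleAlerting, LastReminderSent, RepeatInterval) and the same imports as this file.
// The 100ms interval and the sleep are illustrative; a real test would likely fake the clock.
func TestHandleAlertingSendsReminderAfterRepeatInterval(t *testing.T) {
	_ = os.Setenv("MOCK_ALERT_PROVIDER", "true")
	defer os.Clearenv()

	cfg := &config.Config{
		Debug: true,
		Alerting: &alerting.Config{
			Custom: &custom.AlertProvider{
				URL:    "https://twin.sh/health",
				Method: "GET",
			},
		},
	}
	enabled := true
	endpoint := &core.Endpoint{
		URL: "https://example.com",
		Alerts: []*alert.Alert{
			{
				Type:             alert.TypeCustom,
				Enabled:          &enabled,
				FailureThreshold: 1,
				SuccessThreshold: 1,
				Triggered:        false,
				RepeatInterval:   100 * time.Millisecond,
			},
		},
	}

	// First failure sends the initial alert and records the time it was sent
	HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
	initialSend := endpoint.LastReminderSent
	if initialSend.IsZero() {
		t.Fatal("expected LastReminderSent to be set after the initial alert")
	}

	// A failure before the interval has elapsed should not produce a reminder
	HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
	if !endpoint.LastReminderSent.Equal(initialSend) {
		t.Error("expected no reminder before repeat-interval elapsed")
	}

	// Once the interval has elapsed, the next failure should send a reminder
	time.Sleep(150 * time.Millisecond)
	HandleAlerting(endpoint, &core.Result{Success: false}, cfg.Alerting, cfg.Debug)
	if !endpoint.LastReminderSent.After(initialSend) {
		t.Error("expected LastReminderSent to be updated after repeat-interval elapsed")
	}
}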

func verify(t *testing.T, endpoint *core.Endpoint, expectedNumberOfFailuresInARow, expectedNumberOfSuccessInARow int, expectedTriggered bool, expectedTriggeredReason string) {
if endpoint.NumberOfFailuresInARow != expectedNumberOfFailuresInARow {
t.Errorf("endpoint.NumberOfFailuresInARow should've been %d, got %d", expectedNumberOfFailuresInARow, endpoint.NumberOfFailuresInARow)