Skip to content

Commit

Permalink
feat: add support for custom labels in Prometheus metrics
Browse files Browse the repository at this point in the history
- Add a `Labels` field to the `Config` struct to include additional labels for metrics
- Add a `Labels` field to the `Endpoint` struct to include key-value pairs for endpoint metrics
- Inline the `CheckSSHBanner` call in the `call` function of the `Endpoint` struct
- Import the `metrics` package in `main.go` and initialize Prometheus metrics
- Export `initializePrometheusMetrics` as `InitializePrometheusMetrics`, make it accept a `Config` parameter, and include labels in metric definitions
- Update `PublishMetricsForEndpoint` to handle labels and include them in metric values
- Update `metrics_test.go` to initialize Prometheus metrics with a `Config` and pass labels to `PublishMetricsForEndpoint`
- Pass labels to the `monitor` and `execute` functions in `watchdog.go` and use them in metric publishing

Signed-off-by: appleboy <[email protected]>
  • Loading branch information
appleboy committed Jan 26, 2025
1 parent dd839be commit 4a7d9f1
Show file tree
Hide file tree
Showing 6 changed files with 53 additions and 36 deletions.
3 changes: 3 additions & 0 deletions config/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,9 @@ type Config struct {
// Metrics Whether to expose metrics at /metrics
Metrics bool `yaml:"metrics,omitempty"`

// Labels is the list of label names added to every metric; each endpoint supplies the values through its own labels
Labels []string `yaml:"labels,omitempty"`

// SkipInvalidConfigUpdate Whether to make the application ignore invalid configuration
// if the configuration file is updated while the application is running
SkipInvalidConfigUpdate bool `yaml:"skip-invalid-config-update,omitempty"`
Expand Down
6 changes: 4 additions & 2 deletions config/endpoint/endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,9 @@ type Endpoint struct {
// Headers of the request
Headers map[string]string `yaml:"headers,omitempty"`

// Labels are key-value pairs used as label values on the endpoint's metrics (only keys listed in the top-level labels are published)
Labels map[string]string `yaml:"labels,omitempty"`

// Interval is the duration to wait between every status check
Interval time.Duration `yaml:"interval,omitempty"`

Expand Down Expand Up @@ -365,8 +368,7 @@ func (e *Endpoint) call(result *Result) {
} else if endpointType == TypeSSH {
// If there's no username/password specified, attempt to validate just the SSH banner
if len(e.SSHConfig.Username) == 0 && len(e.SSHConfig.Password) == 0 {
result.Connected, result.HTTPStatus, err =
client.CheckSSHBanner(strings.TrimPrefix(e.URL, "ssh://"), e.ClientConfig)
result.Connected, result.HTTPStatus, err = client.CheckSSHBanner(strings.TrimPrefix(e.URL, "ssh://"), e.ClientConfig)
if err != nil {
result.AddError(err.Error())
return
Expand Down
3 changes: 3 additions & 0 deletions main.go
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ import (

"github.com/TwiN/gatus/v5/config"
"github.com/TwiN/gatus/v5/controller"
"github.com/TwiN/gatus/v5/metrics"
"github.com/TwiN/gatus/v5/storage/store"
"github.com/TwiN/gatus/v5/watchdog"
"github.com/TwiN/logr"
Expand Down Expand Up @@ -49,6 +50,8 @@ func main() {

// start launches the application's components for the given configuration:
// the controller in its own goroutine, the Prometheus metrics, the watchdog,
// and finally the configuration file watcher in its own goroutine.
func start(cfg *config.Config) {
go controller.Handle(cfg)
// Create the Prometheus metric vectors before the watchdog starts, because
// PublishMetricsForEndpoint uses them without initializing them itself.
// NOTE(review): the vectors are built with promauto, which panics when a metric
// is registered twice -- confirm start is never invoked a second time (e.g. on a
// configuration reload) without the previous metrics being unregistered.
metrics.InitializePrometheusMetrics(cfg)
watchdog.Monitor(cfg)
go listenToConfigurationFileChanges(cfg)
}
Expand Down
46 changes: 25 additions & 21 deletions metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ package metrics
import (
"strconv"

"github.com/TwiN/gatus/v5/config"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"
Expand All @@ -11,8 +12,6 @@ import (
const namespace = "gatus" // The prefix of the metrics

var (
initializedMetrics bool // Whether the metrics have been initialized

resultTotal *prometheus.CounterVec
resultDurationSeconds *prometheus.GaugeVec
resultConnectedTotal *prometheus.CounterVec
Expand All @@ -21,64 +20,69 @@ var (
resultEndpointSuccess *prometheus.GaugeVec
)

func initializePrometheusMetrics() {
func InitializePrometheusMetrics(cfg *config.Config) {
resultTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Name: "results_total",
Help: "Number of results per endpoint",
}, []string{"key", "group", "name", "type", "success"})
}, append([]string{"key", "group", "name", "type", "success"}, cfg.Labels...))
resultDurationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "results_duration_seconds",
Help: "Duration of the request in seconds",
}, []string{"key", "group", "name", "type"})
}, append([]string{"key", "group", "name", "type"}, cfg.Labels...))
resultConnectedTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Name: "results_connected_total",
Help: "Total number of results in which a connection was successfully established",
}, []string{"key", "group", "name", "type"})
}, append([]string{"key", "group", "name", "type"}, cfg.Labels...))
resultCodeTotal = promauto.NewCounterVec(prometheus.CounterOpts{
Namespace: namespace,
Name: "results_code_total",
Help: "Total number of results by code",
}, []string{"key", "group", "name", "type", "code"})
}, append([]string{"key", "group", "name", "type", "code"}, cfg.Labels...))
resultCertificateExpirationSeconds = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "results_certificate_expiration_seconds",
Help: "Number of seconds until the certificate expires",
}, []string{"key", "group", "name", "type"})
}, append([]string{"key", "group", "name", "type"}, cfg.Labels...))
resultEndpointSuccess = promauto.NewGaugeVec(prometheus.GaugeOpts{
Namespace: namespace,
Name: "results_endpoint_success",
Help: "Displays whether or not the endpoint was a success",
}, []string{"key", "group", "name", "type"})
}, append([]string{"key", "group", "name", "type"}, cfg.Labels...))
}

// PublishMetricsForEndpoint publishes metrics for the given endpoint and its result.
// These metrics will be exposed at /metrics if the metrics are enabled
func PublishMetricsForEndpoint(ep *endpoint.Endpoint, result *endpoint.Result) {
if !initializedMetrics {
initializePrometheusMetrics()
initializedMetrics = true
func PublishMetricsForEndpoint(labels []string, ep *endpoint.Endpoint, result *endpoint.Result) {
labelValues := []string{}
for _, label := range labels {
if value, ok := ep.Labels[label]; ok {
labelValues = append(labelValues, value)
} else {
labelValues = append(labelValues, "")
}
}

endpointType := ep.Type()
resultTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.FormatBool(result.Success)).Inc()
resultDurationSeconds.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(result.Duration.Seconds())
resultTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.FormatBool(result.Success)}, labelValues...)...).Inc()
resultDurationSeconds.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(result.Duration.Seconds())
if result.Connected {
resultConnectedTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Inc()
resultConnectedTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Inc()
}
if result.DNSRCode != "" {
resultCodeTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), result.DNSRCode).Inc()
resultCodeTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), result.DNSRCode}, labelValues...)...).Inc()
}
if result.HTTPStatus != 0 {
resultCodeTotal.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.Itoa(result.HTTPStatus)).Inc()
resultCodeTotal.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType), strconv.Itoa(result.HTTPStatus)}, labelValues...)...).Inc()
}
if result.CertificateExpiration != 0 {
resultCertificateExpirationSeconds.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(result.CertificateExpiration.Seconds())
resultCertificateExpirationSeconds.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(result.CertificateExpiration.Seconds())
}
if result.Success {
resultEndpointSuccess.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(1)
resultEndpointSuccess.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(1)
} else {
resultEndpointSuccess.WithLabelValues(ep.Key(), ep.Group, ep.Name, string(endpointType)).Set(0)
resultEndpointSuccess.WithLabelValues(append([]string{ep.Key(), ep.Group, ep.Name, string(endpointType)}, labelValues...)...).Set(0)
}
}
19 changes: 12 additions & 7 deletions metrics/metrics_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,18 @@ import (
"testing"
"time"

"github.com/TwiN/gatus/v5/config"
"github.com/TwiN/gatus/v5/config/endpoint"
"github.com/TwiN/gatus/v5/config/endpoint/dns"
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/testutil"
)

func TestPublishMetricsForEndpoint(t *testing.T) {
InitializePrometheusMetrics(&config.Config{})

httpEndpoint := &endpoint.Endpoint{Name: "http-ep-name", Group: "http-ep-group", URL: "https://example.org"}
PublishMetricsForEndpoint(httpEndpoint, &endpoint.Result{
PublishMetricsForEndpoint([]string{}, httpEndpoint, &endpoint.Result{
HTTPStatus: 200,
Connected: true,
Duration: 123 * time.Millisecond,
Expand Down Expand Up @@ -47,7 +50,7 @@ gatus_results_endpoint_success{group="http-ep-group",key="http-ep-group_http-ep-
if err != nil {
t.Errorf("Expected no errors but got: %v", err)
}
PublishMetricsForEndpoint(httpEndpoint, &endpoint.Result{
PublishMetricsForEndpoint([]string{}, httpEndpoint, &endpoint.Result{
HTTPStatus: 200,
Connected: true,
Duration: 125 * time.Millisecond,
Expand Down Expand Up @@ -82,11 +85,13 @@ gatus_results_endpoint_success{group="http-ep-group",key="http-ep-group_http-ep-
if err != nil {
t.Errorf("Expected no errors but got: %v", err)
}
dnsEndpoint := &endpoint.Endpoint{Name: "dns-ep-name", Group: "dns-ep-group", URL: "8.8.8.8", DNSConfig: &dns.Config{
QueryType: "A",
QueryName: "example.com.",
}}
PublishMetricsForEndpoint(dnsEndpoint, &endpoint.Result{
dnsEndpoint := &endpoint.Endpoint{
Name: "dns-ep-name", Group: "dns-ep-group", URL: "8.8.8.8", DNSConfig: &dns.Config{
QueryType: "A",
QueryName: "example.com.",
},
}
PublishMetricsForEndpoint([]string{}, dnsEndpoint, &endpoint.Result{
DNSRCode: "NOERROR",
Connected: true,
Duration: 50 * time.Millisecond,
Expand Down
12 changes: 6 additions & 6 deletions watchdog/watchdog.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,31 +31,31 @@ func Monitor(cfg *config.Config) {
if endpoint.IsEnabled() {
// To prevent multiple requests from running at the same time, we'll wait for a little before each iteration
time.Sleep(777 * time.Millisecond)
go monitor(endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx)
go monitor(cfg.Labels, endpoint, cfg.Alerting, cfg.Maintenance, cfg.Connectivity, cfg.DisableMonitoringLock, cfg.Metrics, ctx)
}
}
}

// monitor a single endpoint in a loop
func monitor(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context) {
func monitor(labels []string, ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool, ctx context.Context) {
// Run it immediately on start
execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics)
execute(labels, ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics)
// Loop for the next executions
for {
select {
case <-ctx.Done():
logr.Warnf("[watchdog.monitor] Canceling current execution of group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
return
case <-time.After(ep.Interval):
execute(ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics)
execute(labels, ep, alertingConfig, maintenanceConfig, connectivityConfig, disableMonitoringLock, enabledMetrics)
}
}
// Just in case somebody wandered all the way to here and wonders, "what about ExternalEndpoints?"
// Alerting is checked every time an external endpoint is pushed to Gatus, so they're not monitored
// periodically like they are for normal endpoints.
}

func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool) {
func execute(labels []string, ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenanceConfig *maintenance.Config, connectivityConfig *connectivity.Config, disableMonitoringLock bool, enabledMetrics bool) {
if !disableMonitoringLock {
// By placing the lock here, we prevent multiple endpoints from being monitored at the exact same time, which
// could cause performance issues and return inaccurate results
Expand All @@ -70,7 +70,7 @@ func execute(ep *endpoint.Endpoint, alertingConfig *alerting.Config, maintenance
logr.Debugf("[watchdog.execute] Monitoring group=%s; endpoint=%s; key=%s", ep.Group, ep.Name, ep.Key())
result := ep.EvaluateHealth()
if enabledMetrics {
metrics.PublishMetricsForEndpoint(ep, result)
metrics.PublishMetricsForEndpoint(labels, ep, result)
}
UpdateEndpointStatuses(ep, result)
if logr.GetThreshold() == logr.LevelDebug && !result.Success {
Expand Down

0 comments on commit 4a7d9f1

Please sign in to comment.