forked from vcavallo/nostr-hypermedia
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathmetrics.go
More file actions
205 lines (167 loc) · 7.39 KB
/
metrics.go
File metadata and controls
205 lines (167 loc) · 7.39 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
package main
import (
"fmt"
"net/http"
"runtime"
"sync/atomic"
"time"
)
// Process-wide metric counters, all reported by metricsHandler.
// Each one is an atomic.Int64, so reads and updates need no extra locking.
var (
	// HTTP metrics
	httpRequestsTotal atomic.Int64 // exposed as http_requests_total
	httpErrorsTotal   atomic.Int64 // exposed as http_errors_total (HTTP 5xx errors)

	// Relay metrics
	droppedEventCount atomic.Int64 // exposed as nostr_events_dropped_total

	// Cache metrics
	cacheHitsTotal   atomic.Int64 // exposed as cache_hits_total
	cacheMissesTotal atomic.Int64 // exposed as cache_misses_total

	// SSE connection metrics
	sseConnectionsActive atomic.Int64 // exposed as sse_connections_active
)
// IncrementSSEConnections atomically adds one to sseConnectionsActive, the
// value metricsHandler reports as the sse_connections_active gauge.
// NOTE(review): presumably called when an SSE client connects and paired with
// DecrementSSEConnections on disconnect — confirm at the call sites.
func IncrementSSEConnections() {
sseConnectionsActive.Add(1)
}
// DecrementSSEConnections atomically subtracts one from sseConnectionsActive,
// the value metricsHandler reports as the sse_connections_active gauge.
// No lower bound is enforced here, so unbalanced calls can drive the gauge
// below zero.
func DecrementSSEConnections() {
sseConnectionsActive.Add(-1)
}
// IncrementCacheHit atomically adds one to cacheHitsTotal. metricsHandler
// reports that counter as cache_hits_total and uses it, together with
// cacheMissesTotal, to derive cache_hit_ratio.
func IncrementCacheHit() {
cacheHitsTotal.Add(1)
}
// IncrementCacheMiss atomically adds one to cacheMissesTotal. metricsHandler
// reports that counter as cache_misses_total and uses it, together with
// cacheHitsTotal, to derive cache_hit_ratio.
func IncrementCacheMiss() {
cacheMissesTotal.Add(1)
}
// GetConnectionStats reports the pool's connection usage.
//
// active is the number of entries currently in p.connections and max is the
// package-level limit maxTotalConnections. Both are read while holding the
// pool's read lock.
func (p *RelayPool) GetConnectionStats() (active int, max int) {
	p.mu.RLock()
	active, max = len(p.connections), maxTotalConnections
	p.mu.RUnlock()

	return active, max
}
// RelayHealthDetail holds per-relay health information.
//
// The struct tags define its JSON field names. metricsHandler reads fields
// with these names from the entries returned by
// relayHealthStore.GetRelayHealthDetails to emit the per-relay metrics.
// NOTE(review): presumably that method returns a slice of this type — confirm
// against its definition.
type RelayHealthDetail struct {
// URL identifies the relay.
URL string `json:"url"`
// Status is compared against "healthy" by metricsHandler; any other value
// is reported as not healthy.
Status string `json:"status"` // "healthy" or "unhealthy"
// AvgResponseMs is the relay's average response time, in milliseconds
// going by the field name and JSON tag.
AvgResponseMs int64 `json:"avg_response_ms"`
// RequestCount is the number of requests recorded for the relay.
RequestCount int64 `json:"request_count"`
}
// metricsHandler serves Prometheus-compatible metrics in the text exposition
// format (version 0.0.4, as declared in the Content-Type header).
//
// Each call writes a fresh snapshot of: build info, process start/uptime,
// Go runtime memory and GC statistics, HTTP and SSE counters, relay pool
// usage, aggregate and per-relay health, dropped events, and cache hit/miss
// counters with the derived hit ratio.
//
// Errors from writing to w are not checked: once the response has started
// there is no way to report a failed write back to the client.
func metricsHandler(w http.ResponseWriter, r *http.Request) {
	w.Header().Set("Content-Type", "text/plain; version=0.0.4; charset=utf-8")

	// Build info metric
	writeMetricHeader(w, "nostr_build_info", "Build and configuration information", "gauge")
	fmt.Fprintf(w, "nostr_build_info{cache_backend=%q,go_version=%q} 1\n\n", cacheBackendType, runtime.Version())

	// Process metrics
	writeMetricHeader(w, "process_start_time_seconds", "Unix timestamp of process start", "gauge")
	fmt.Fprintf(w, "process_start_time_seconds %d\n\n", serverStartTime.Unix())

	writeMetricHeader(w, "process_uptime_seconds", "Time since process started", "gauge")
	fmt.Fprintf(w, "process_uptime_seconds %.0f\n\n", time.Since(serverStartTime).Seconds())

	// Go runtime metrics
	var memStats runtime.MemStats
	runtime.ReadMemStats(&memStats)

	writeMetricHeader(w, "go_goroutines", "Number of active goroutines", "gauge")
	fmt.Fprintf(w, "go_goroutines %d\n\n", runtime.NumGoroutine())

	writeMetricHeader(w, "go_memstats_alloc_bytes", "Currently allocated memory in bytes", "gauge")
	fmt.Fprintf(w, "go_memstats_alloc_bytes %d\n\n", memStats.Alloc)

	writeMetricHeader(w, "go_memstats_sys_bytes", "Total memory obtained from the OS", "gauge")
	fmt.Fprintf(w, "go_memstats_sys_bytes %d\n\n", memStats.Sys)

	writeMetricHeader(w, "go_memstats_heap_inuse_bytes", "Heap memory in use", "gauge")
	fmt.Fprintf(w, "go_memstats_heap_inuse_bytes %d\n\n", memStats.HeapInuse)

	// PauseTotalNs is in nanoseconds; the metric is exposed in seconds.
	writeMetricHeader(w, "go_gc_duration_seconds_total", "Total GC pause duration", "counter")
	fmt.Fprintf(w, "go_gc_duration_seconds_total %.6f\n\n", float64(memStats.PauseTotalNs)/1e9)

	writeMetricHeader(w, "go_gc_cycles_total", "Number of completed GC cycles", "counter")
	fmt.Fprintf(w, "go_gc_cycles_total %d\n\n", memStats.NumGC)

	// HTTP metrics
	writeMetricHeader(w, "http_requests_total", "Total number of HTTP requests", "counter")
	fmt.Fprintf(w, "http_requests_total %d\n\n", httpRequestsTotal.Load())

	writeMetricHeader(w, "http_errors_total", "Total number of HTTP 5xx errors", "counter")
	fmt.Fprintf(w, "http_errors_total %d\n\n", httpErrorsTotal.Load())

	// SSE metrics
	writeMetricHeader(w, "sse_connections_active", "Number of active SSE connections", "gauge")
	fmt.Fprintf(w, "sse_connections_active %d\n\n", sseConnectionsActive.Load())

	// Connection pool metrics
	activeConns, maxConns := relayPool.GetConnectionStats()

	writeMetricHeader(w, "nostr_relay_connections_active", "Number of active relay connections", "gauge")
	fmt.Fprintf(w, "nostr_relay_connections_active %d\n\n", activeConns)

	writeMetricHeader(w, "nostr_relay_connections_max", "Maximum relay connections allowed", "gauge")
	fmt.Fprintf(w, "nostr_relay_connections_max %d\n\n", maxConns)

	// Relay health summary
	healthy, unhealthy, avgMs := relayHealthStore.GetRelayHealthStats()

	writeMetricHeader(w, "nostr_relays_healthy", "Number of healthy relays", "gauge")
	fmt.Fprintf(w, "nostr_relays_healthy %d\n\n", healthy)

	writeMetricHeader(w, "nostr_relays_unhealthy", "Number of unhealthy relays", "gauge")
	fmt.Fprintf(w, "nostr_relays_unhealthy %d\n\n", unhealthy)

	writeMetricHeader(w, "nostr_relay_avg_response_ms", "Average relay response time in milliseconds", "gauge")
	fmt.Fprintf(w, "nostr_relay_avg_response_ms %d\n\n", avgMs)

	// Per-relay metrics with labels. The whole section, headers included, is
	// omitted when no relay details are available.
	relayDetails := relayHealthStore.GetRelayHealthDetails()
	if len(relayDetails) > 0 {
		// Relay URLs are escaped with escapeMetricLabel rather than %q: %q
		// applies Go string-literal escaping (\t, \x00, \u00e9, ...), which
		// the Prometheus text format does not accept inside label values.
		writeMetricHeader(w, "nostr_relay_response_ms", "Response time per relay in milliseconds", "gauge")
		for _, detail := range relayDetails {
			fmt.Fprintf(w, "nostr_relay_response_ms{relay=\"%s\"} %d\n", escapeMetricLabel(detail.URL), detail.AvgResponseMs)
		}
		fmt.Fprintf(w, "\n")

		writeMetricHeader(w, "nostr_relay_requests_total", "Total requests per relay", "counter")
		for _, detail := range relayDetails {
			fmt.Fprintf(w, "nostr_relay_requests_total{relay=\"%s\"} %d\n", escapeMetricLabel(detail.URL), detail.RequestCount)
		}
		fmt.Fprintf(w, "\n")

		writeMetricHeader(w, "nostr_relay_healthy", "Whether relay is healthy (1) or not (0)", "gauge")
		for _, detail := range relayDetails {
			healthyVal := 0
			if detail.Status == "healthy" {
				healthyVal = 1
			}
			fmt.Fprintf(w, "nostr_relay_healthy{relay=\"%s\"} %d\n", escapeMetricLabel(detail.URL), healthyVal)
		}
		fmt.Fprintf(w, "\n")
	}

	// Event metrics
	writeMetricHeader(w, "nostr_events_dropped_total", "Events dropped due to full channels", "counter")
	fmt.Fprintf(w, "nostr_events_dropped_total %d\n\n", droppedEventCount.Load())

	// Cache metrics. Each counter is loaded once so the reported totals and
	// the derived ratio come from the same two values.
	cacheHits := cacheHitsTotal.Load()
	cacheMisses := cacheMissesTotal.Load()

	writeMetricHeader(w, "cache_hits_total", "Total cache hits", "counter")
	fmt.Fprintf(w, "cache_hits_total %d\n\n", cacheHits)

	writeMetricHeader(w, "cache_misses_total", "Total cache misses", "counter")
	fmt.Fprintf(w, "cache_misses_total %d\n\n", cacheMisses)

	// Cache hit ratio (useful for alerting). Stays 0 until the first lookup
	// is recorded, which avoids dividing by zero.
	var hitRatio float64
	if total := cacheHits + cacheMisses; total > 0 {
		hitRatio = float64(cacheHits) / float64(total)
	}
	writeMetricHeader(w, "cache_hit_ratio", "Cache hit ratio (0-1)", "gauge")
	fmt.Fprintf(w, "cache_hit_ratio %.4f\n", hitRatio)
}

// writeMetricHeader writes the "# HELP" and "# TYPE" lines that introduce the
// metric family name; kind is the Prometheus metric type (e.g. "gauge").
func writeMetricHeader(w http.ResponseWriter, name, help, kind string) {
	fmt.Fprintf(w, "# HELP %s %s\n# TYPE %s %s\n", name, help, name, kind)
}

// escapeMetricLabel escapes value for use between double quotes as a label
// value in the Prometheus text format: backslash, double quote and line feed
// become \\, \" and \n, and every other character is written unchanged.
func escapeMetricLabel(value string) string {
	buf := make([]byte, 0, len(value))
	for _, r := range value {
		switch r {
		case '\\':
			buf = append(buf, `\\`...)
		case '"':
			buf = append(buf, `\"`...)
		case '\n':
			buf = append(buf, `\n`...)
		default:
			// Ranging over a string yields U+FFFD for invalid bytes, so the
			// result is always valid UTF-8.
			buf = append(buf, string(r)...)
		}
	}
	return string(buf)
}