
Commit 25b8d52

lindenb1 committed
✨(backend) add prometheus metrics and probe endpoints
Provides Prometheus metrics, a custom metrics exporter, a CIDR filter that restricts the monitoring views to allowed networks, and readiness/liveness probe endpoints for Kubernetes.

Signed-off-by: lindenb1 <[email protected]>
1 parent 9194bf5 commit 25b8d52

File tree

13 files changed: +836 −0 lines changed


.github/workflows/impress.yml

Lines changed: 3 additions & 0 deletions
@@ -153,6 +153,9 @@ jobs:
       AWS_S3_ENDPOINT_URL: http://localhost:9000
       AWS_S3_ACCESS_KEY_ID: impress
       AWS_S3_SECRET_ACCESS_KEY: password
+      MONITORING_PROMETHEUS_EXPORTER: true
+      MONITORING_PROBING: true
+      MONITORING_ALLOWED_CIDR_RANGES: "*"

     steps:
       - name: Checkout repository

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ and this project adheres to

 ## [Unreleased]

+## Added
+
+- ✨(backend) add prometheus metrics and probe endpoints #455
+
 ## [2.0.1] - 2025-01-17

 ## Fixed

docker-compose.yml

Lines changed: 3 additions & 0 deletions
@@ -50,6 +50,9 @@ services:
     environment:
       - PYLINTHOME=/app/.pylint.d
       - DJANGO_CONFIGURATION=Development
+      - MONITORING_PROMETHEUS_EXPORTER=true
+      - MONITORING_PROBING=true
+      - MONITORING_ALLOWED_CIDR_RANGES="*"
     env_file:
      - env.d/development/common
      - env.d/development/postgresql
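
The diff adds the three MONITORING_* variables to the CI and development environments but does not show how the backend consumes them. Purely as a hedged illustration (class name, defaults, and placement are assumptions, not part of this commit), the two boolean toggles could be exposed as settings with django-configurations, which the stack already selects via DJANGO_CONFIGURATION; note that the CIDR ranges are read directly from os.environ by the decorator shown later in this commit.

    # Hypothetical settings sketch (django-configurations); not part of this commit.
    from configurations import Configuration, values


    class Base(Configuration):
        # Toggle the custom Prometheus exporter and the probe endpoints.
        MONITORING_PROMETHEUS_EXPORTER = values.BooleanValue(
            False, environ_name="MONITORING_PROMETHEUS_EXPORTER", environ_prefix=None
        )
        MONITORING_PROBING = values.BooleanValue(
            False, environ_name="MONITORING_PROBING", environ_prefix=None
        )
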
Lines changed: 196 additions & 0 deletions
@@ -0,0 +1,196 @@
"""Custom Prometheus Metrics Exporter for Impress' core application."""

from datetime import timedelta

from django.conf import settings
from django.db.models import Count, F, Max, Min, Q
from django.utils.timezone import now

from prometheus_client.core import GaugeMetricFamily

from core import models


class CustomMetricsExporter:
    """
    Custom Prometheus metrics collector for various application
    relevant metrics.
    """

    def collect(self):
        """
        Collect and yield Prometheus metrics for user activity, document activity,
        and document statistics over various time periods.
        """

        namespace = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "")

        def prefixed_metric_name(name):
            return f"{namespace}_{name}" if namespace else name

        # Group time boundaries into a dictionary to reduce separate local variables
        times = {}
        times["today_start_utc"] = now().replace(hour=0, minute=0, second=0, microsecond=0)
        times["one_week_ago"] = times["today_start_utc"] - timedelta(days=7)
        times["one_month_ago"] = times["today_start_utc"] - timedelta(days=30)

        # Group user queries/metrics into a dictionary
        user_metrics = {
            "total_users": models.User.objects.count(),
            "active_users_today": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["today_start_utc"])
                | Q(link_traces__created_at__gte=times["today_start_utc"])
                | Q(last_login__gte=times["today_start_utc"])
            )
            .distinct()
            .count(),
            "active_users_7_days": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["one_week_ago"])
                | Q(link_traces__created_at__gte=times["one_week_ago"])
                | Q(last_login__gte=times["one_week_ago"])
            )
            .distinct()
            .count(),
            "active_users_30_days": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["one_month_ago"])
                | Q(link_traces__created_at__gte=times["one_month_ago"])
                | Q(last_login__gte=times["one_month_ago"])
            )
            .distinct()
            .count(),
        }

        # Group document queries/metrics into a dictionary
        doc_metrics = {
            "total_documents": models.Document.objects.count(),
            "shared_docs_count": (
                models.Document.objects.annotate(access_count=Count("accesses"))
                .filter(access_count__gt=1)
                .count()
            ),
            "active_docs_today": models.Document.objects.filter(
                updated_at__gte=times["today_start_utc"],
                updated_at__lt=times["today_start_utc"] + timedelta(days=1),
            ).count(),
            "active_docs_last_7_days": models.Document.objects.filter(
                updated_at__gte=times["one_week_ago"]
            ).count(),
            "active_docs_last_30_days": models.Document.objects.filter(
                updated_at__gte=times["one_month_ago"]
            ).count(),
        }

        # Use a single aggregation call for oldest/newest document creation date
        doc_ages = models.Document.objects.aggregate(
            oldest=Min("created_at"),
            newest=Max("created_at"),
        )

        # Prepare user distribution data
        user_doc_counts = models.DocumentAccess.objects.values("user_id").annotate(
            doc_count=Count("document_id"), admin_email=F("user__admin_email")
        )

        # Collect all metrics in one list
        metrics = []

        # -- User metrics
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("total_users"),
                "Total number of users",
                value=user_metrics["total_users"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_today"),
                "Number of active users today",
                value=user_metrics["active_users_today"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_7_days"),
                "Number of active users in the last 7 days",
                value=user_metrics["active_users_7_days"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_30_days"),
                "Number of active users in the last 30 days",
                value=user_metrics["active_users_30_days"],
            )
        )

        # -- Document metrics
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("total_documents"),
                "Total number of documents",
                value=doc_metrics["total_documents"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("shared_documents"),
                "Number of shared documents",
                value=doc_metrics["shared_docs_count"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_today"),
                "Number of active documents today",
                value=doc_metrics["active_docs_today"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_7_days"),
                "Number of active documents in the last 7 days",
                value=doc_metrics["active_docs_last_7_days"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_30_days"),
                "Number of active documents in the last 30 days",
                value=doc_metrics["active_docs_last_30_days"],
            )
        )

        # -- Document oldest/newest timestamps
        if doc_ages["oldest"]:
            metrics.append(
                GaugeMetricFamily(
                    prefixed_metric_name("oldest_document_date"),
                    "Timestamp of the oldest document creation date",
                    value=doc_ages["oldest"].timestamp(),
                )
            )
        if doc_ages["newest"]:
            metrics.append(
                GaugeMetricFamily(
                    prefixed_metric_name("newest_document_date"),
                    "Timestamp of the newest document creation date",
                    value=doc_ages["newest"].timestamp(),
                )
            )

        # -- User document distribution
        user_distribution_metric = GaugeMetricFamily(
            prefixed_metric_name("user_document_distribution"),
            "Document counts per user",
            labels=["user_email"],
        )
        for user in user_doc_counts:
            if user["admin_email"]:  # Validate email existence
                user_distribution_metric.add_metric(
                    [user["admin_email"]], user["doc_count"]
                )
        metrics.append(user_distribution_metric)

        # Yield from metrics
        yield from metrics
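
The class above implements the prometheus_client collector protocol: collect() yields GaugeMetricFamily samples. The commit does not show where the collector is registered, so the following is only a minimal sketch; the module path and the settings flag used for gating are assumptions.

    # Hypothetical registration sketch; module path and settings flag are assumptions.
    from django.conf import settings
    from prometheus_client.core import REGISTRY

    from core.api.custom_metrics_exporter import CustomMetricsExporter  # assumed module path

    if getattr(settings, "MONITORING_PROMETHEUS_EXPORTER", False):
        # Register once at startup so the metrics endpoint also exposes the custom gauges.
        REGISTRY.register(CustomMetricsExporter())
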
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@
"""API liveness and readiness probes for Impress' core application."""

import uuid

from django.core.cache import CacheKeyWarning, cache
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import connections
from django.db.utils import OperationalError
from django.http import JsonResponse

import requests
from botocore.exceptions import BotoCoreError, ClientError


def liveness_check(request):
    """
    Liveness probe endpoint.
    Returns HTTP 200 if the application is alive and running.
    """

    return JsonResponse({"status": "OK"}, status=200)


def readiness_check(request):
    """
    Readiness probe endpoint.
    Checks database, cache, media storage, and OIDC configuration.
    Returns HTTP 200 with JSON status "OK" if all checks pass,
    or HTTP 500 with JSON status "Error" and an error message.
    """

    def check_database():
        """Check database connectivity."""
        try:
            db_conn = connections["default"]
            db_conn.cursor()
        except OperationalError as e:
            raise RuntimeError(
                "Database connectivity check failed."
                "Please verify your database configuration and status."
            ) from e

    def check_cache():
        """Check cache connectivity."""
        test_key = "readiness-probe"
        test_value = "ready"
        cache.set(test_key, test_value, timeout=5)
        if cache.get(test_key) != test_value:
            raise RuntimeError(
                "Cache check failed: Value mismatch or cache unavailable."
            )

    def check_media_storage():
        """Check S3 storage connectivity by attempting to write and delete a test file."""
        test_file_name = f"readiness-check-{uuid.uuid4()}.txt"
        test_content = ContentFile(b"readiness check")

        try:
            # Attempt to save the test file
            default_storage.save(test_file_name, test_content)
            # Attempt to delete the test file
            default_storage.delete(test_file_name)
        except (SuspiciousFileOperation, OSError, BotoCoreError, ClientError) as e:
            # Re-raise with context from the original exception
            raise RuntimeError("Media storage check failed.") from e

    try:
        # Run all checks
        check_database()
        check_cache()
        check_media_storage()

        # If all checks pass
        return JsonResponse({"status": "OK"}, status=200)

    except (
        OperationalError,
        CacheKeyWarning,
        BotoCoreError,
        ClientError,
        requests.RequestException,
    ) as e:
        return JsonResponse({"status": "Error", "message": str(e)}, status=500)
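
Both probes are plain Django views, so they can be exercised without URL routing through RequestFactory. Below is a hedged smoke-test sketch, assuming a configured database, cache, and storage backend (for example under pytest-django); the module path and test names are illustrative only.

    # Hypothetical smoke test for the probe views; module path is an assumption.
    import pytest
    from django.test import RequestFactory

    from core.api.probes import liveness_check, readiness_check  # assumed module path


    def test_liveness_returns_ok():
        # The liveness probe answers 200 whenever the process is up.
        response = liveness_check(RequestFactory().get("/probes/liveness/"))
        assert response.status_code == 200


    @pytest.mark.django_db
    def test_readiness_returns_ok():
        # The readiness probe exercises the database, cache, and media storage checks.
        response = readiness_check(RequestFactory().get("/probes/readiness/"))
        assert response.status_code == 200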

src/backend/core/api/decorators.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@
"""Decorators for Impress' core application."""

import os
from ipaddress import ip_address, ip_network

from django.http import HttpResponseForbidden


def monitoring_cidr_protected_view(view):
    """
    Decorator to restrict access to a view based on CIDR ranges.

    Checks the client's IP address against allowed CIDR ranges specified
    in the MONITORING_ALLOWED_CIDR_RANGES environment variable. If the
    IP address is not within the allowed ranges, access is denied.
    """

    def wrapped_view(request, *args, **kwargs):
        cidr_env_raw = os.environ.get("MONITORING_ALLOWED_CIDR_RANGES", "")
        cidr_env_stripped = cidr_env_raw.strip().strip('"').strip("'")

        allow_all = cidr_env_stripped == "*"

        allowed_cidr_ranges = []
        if not allow_all:
            try:
                allowed_cidr_ranges = [
                    ip_network(c.strip().strip('"').strip("'"))
                    for c in cidr_env_stripped.split(",")
                    if c.strip()
                ]
            except ValueError as e:
                raise ValueError(
                    f"Invalid CIDR range in MONITORING_ALLOWED_CIDR_RANGES: {e}"
                ) from e

        client_ip = request.META.get("REMOTE_ADDR")

        if allow_all:
            return view(request, *args, **kwargs)

        if not allowed_cidr_ranges:
            return HttpResponseForbidden("No allowed CIDR ranges configured.")

        if not any(ip_address(client_ip) in cidr for cidr in allowed_cidr_ranges):
            return HttpResponseForbidden("Access denied: Your IP is not allowed.")

        return view(request, *args, **kwargs)

    return wrapped_view
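
The decorator wraps a plain view function, so it composes directly in URL configuration. The URL wiring itself is not part of the diff shown here; the sketch below uses assumed module paths and URL patterns purely for illustration.

    # Hypothetical URL wiring; paths and module locations are assumptions, not part of this commit.
    from django.urls import path

    from core.api.decorators import monitoring_cidr_protected_view  # assumed module path
    from core.api.probes import liveness_check, readiness_check  # assumed module path

    urlpatterns = [
        # Only clients whose IP falls inside MONITORING_ALLOWED_CIDR_RANGES reach the probes.
        path("probes/liveness/", monitoring_cidr_protected_view(liveness_check)),
        path("probes/readiness/", monitoring_cidr_protected_view(readiness_check)),
    ]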
