Commit 755538d

✨(backend) add prometheus metrics and probe endpoints

Provides Prometheus metrics, a custom metrics exporter, a CIDR filter protecting the monitoring views, and readiness/liveness probe endpoints for Kubernetes.

Signed-off-by: lindenb1 <[email protected]>
1 parent 9194bf5 commit 755538d

13 files changed: +838 −0 lines changed


.github/workflows/impress.yml

Lines changed: 3 additions & 0 deletions

@@ -153,6 +153,9 @@ jobs:
       AWS_S3_ENDPOINT_URL: http://localhost:9000
       AWS_S3_ACCESS_KEY_ID: impress
       AWS_S3_SECRET_ACCESS_KEY: password
+      MONITORING_PROMETHEUS_EXPORTER: true
+      MONITORING_PROBING: true
+      MONITORING_ALLOWED_CIDR_RANGES: "*"

    steps:
      - name: Checkout repository

CHANGELOG.md

Lines changed: 4 additions & 0 deletions
@@ -9,6 +9,10 @@ and this project adheres to

 ## [Unreleased]

+## Added
+
+- ✨(backend) add prometheus metrics and probe endpoints #455
+
 ## [2.0.1] - 2025-01-17

 ## Fixed

docker-compose.yml

Lines changed: 3 additions & 0 deletions
@@ -50,6 +50,9 @@ services:
     environment:
       - PYLINTHOME=/app/.pylint.d
       - DJANGO_CONFIGURATION=Development
+      - MONITORING_PROMETHEUS_EXPORTER=true
+      - MONITORING_PROBING=true
+      - MONITORING_ALLOWED_CIDR_RANGES="*"
     env_file:
       - env.d/development/common
       - env.d/development/postgresql

Lines changed: 198 additions & 0 deletions
@@ -0,0 +1,198 @@ (new file)

"""Custom Prometheus Metrics Exporter for Impress' core application."""

from datetime import timedelta

from django.conf import settings
from django.db.models import Count, F, Max, Min, Q
from django.utils.timezone import now

from prometheus_client.core import GaugeMetricFamily

from core import models


class CustomMetricsExporter:
    """
    Custom Prometheus metrics collector for various application
    relevant metrics.
    """

    def collect(self):
        """
        Collect and yield Prometheus metrics for user activity, document activity,
        and document statistics over various time periods.
        """

        namespace = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "")

        def prefixed_metric_name(name):
            return f"{namespace}_{name}" if namespace else name

        # Group time boundaries into a dictionary to reduce separate local variables
        times = {}
        times["today_start_utc"] = now().replace(
            hour=0, minute=0, second=0, microsecond=0
        )
        times["one_week_ago"] = times["today_start_utc"] - timedelta(days=7)
        times["one_month_ago"] = times["today_start_utc"] - timedelta(days=30)

        # Group user queries/metrics into a dictionary
        user_metrics = {
            "total_users": models.User.objects.count(),
            "active_users_today": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["today_start_utc"])
                | Q(link_traces__created_at__gte=times["today_start_utc"])
                | Q(last_login__gte=times["today_start_utc"])
            )
            .distinct()
            .count(),
            "active_users_7_days": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["one_week_ago"])
                | Q(link_traces__created_at__gte=times["one_week_ago"])
                | Q(last_login__gte=times["one_week_ago"])
            )
            .distinct()
            .count(),
            "active_users_30_days": models.User.objects.filter(
                Q(documentaccess__updated_at__gte=times["one_month_ago"])
                | Q(link_traces__created_at__gte=times["one_month_ago"])
                | Q(last_login__gte=times["one_month_ago"])
            )
            .distinct()
            .count(),
        }

        # Group document queries/metrics into a dictionary
        doc_metrics = {
            "total_documents": models.Document.objects.count(),
            "shared_docs_count": (
                models.Document.objects.annotate(access_count=Count("accesses"))
                .filter(access_count__gt=1)
                .count()
            ),
            "active_docs_today": models.Document.objects.filter(
                updated_at__gte=times["today_start_utc"],
                updated_at__lt=times["today_start_utc"] + timedelta(days=1),
            ).count(),
            "active_docs_last_7_days": models.Document.objects.filter(
                updated_at__gte=times["one_week_ago"]
            ).count(),
            "active_docs_last_30_days": models.Document.objects.filter(
                updated_at__gte=times["one_month_ago"]
            ).count(),
        }

        # Use a single aggregation call for oldest/newest document creation date
        doc_ages = models.Document.objects.aggregate(
            oldest=Min("created_at"),
            newest=Max("created_at"),
        )

        # Prepare user distribution data
        user_doc_counts = models.DocumentAccess.objects.values("user_id").annotate(
            doc_count=Count("document_id"), admin_email=F("user__admin_email")
        )

        # Collect all metrics in one list
        metrics = []

        # -- User metrics
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("total_users"),
                "Total number of users",
                value=user_metrics["total_users"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_today"),
                "Number of active users today",
                value=user_metrics["active_users_today"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_7_days"),
                "Number of active users in the last 7 days",
                value=user_metrics["active_users_7_days"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_users_30_days"),
                "Number of active users in the last 30 days",
                value=user_metrics["active_users_30_days"],
            )
        )

        # -- Document metrics
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("total_documents"),
                "Total number of documents",
                value=doc_metrics["total_documents"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("shared_documents"),
                "Number of shared documents",
                value=doc_metrics["shared_docs_count"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_today"),
                "Number of active documents today",
                value=doc_metrics["active_docs_today"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_7_days"),
                "Number of active documents in the last 7 days",
                value=doc_metrics["active_docs_last_7_days"],
            )
        )
        metrics.append(
            GaugeMetricFamily(
                prefixed_metric_name("active_documents_30_days"),
                "Number of active documents in the last 30 days",
                value=doc_metrics["active_docs_last_30_days"],
            )
        )

        # -- Document oldest/newest timestamps
        if doc_ages["oldest"]:
            metrics.append(
                GaugeMetricFamily(
                    prefixed_metric_name("oldest_document_date"),
                    "Timestamp of the oldest document creation date",
                    value=doc_ages["oldest"].timestamp(),
                )
            )
        if doc_ages["newest"]:
            metrics.append(
                GaugeMetricFamily(
                    prefixed_metric_name("newest_document_date"),
                    "Timestamp of the newest document creation date",
                    value=doc_ages["newest"].timestamp(),
                )
            )

        # -- User document distribution
        user_distribution_metric = GaugeMetricFamily(
            prefixed_metric_name("user_document_distribution"),
            "Document counts per user",
            labels=["user_email"],
        )
        for user in user_doc_counts:
            if user["admin_email"]:  # Validate email existence
                user_distribution_metric.add_metric(
                    [user["admin_email"]], user["doc_count"]
                )
        metrics.append(user_distribution_metric)

        # Yield from metrics
        yield from metrics
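The collector above only takes effect once it is registered with a Prometheus registry; that wiring is not part of this excerpt. Below is a minimal sketch, assuming the exporter is importable as core.api.custom_metrics_exporter and is registered against prometheus_client's default registry from an AppConfig.ready() hook (both assumptions, not taken from this commit):

# Hypothetical registration sketch; module path and hook location are assumptions.
from django.apps import AppConfig
from prometheus_client.core import REGISTRY


class CoreConfig(AppConfig):
    name = "core"

    def ready(self):
        # Import inside ready() so nothing touches the database at import time.
        from core.api.custom_metrics_exporter import CustomMetricsExporter

        # Register the custom collector so its collect() method runs on every
        # scrape of the metrics endpoint.
        REGISTRY.register(CustomMetricsExporter())

Note that collect() executes its database queries on every scrape, so the Prometheus scrape interval should be chosen with that query cost in mind.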
Lines changed: 85 additions & 0 deletions
@@ -0,0 +1,85 @@ (new file)

"""API liveness and readiness probes for Impress' core application."""

import uuid

from django.core.cache import CacheKeyWarning, cache
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import connections
from django.db.utils import OperationalError
from django.http import JsonResponse

import requests
from botocore.exceptions import BotoCoreError, ClientError


def liveness_check(request):
    """
    Liveness probe endpoint.
    Returns HTTP 200 if the application is alive and running.
    """

    return JsonResponse({"status": "OK"}, status=200)


def readiness_check(request):
    """
    Readiness probe endpoint.
    Checks database, cache, and media storage connectivity.
    Returns HTTP 200 with JSON status "OK" if all checks pass,
    or HTTP 500 with JSON status "Error" and an error message.
    """

    def check_database():
        """Check database connectivity."""
        try:
            db_conn = connections["default"]
            db_conn.cursor()
        except OperationalError as e:
            raise RuntimeError(
                "Database connectivity check failed. "
                "Please verify your database configuration and status."
            ) from e

    def check_cache():
        """Check cache connectivity."""
        test_key = "readiness-probe"
        test_value = "ready"
        cache.set(test_key, test_value, timeout=5)
        if cache.get(test_key) != test_value:
            raise RuntimeError(
                "Cache check failed: Value mismatch or cache unavailable."
            )

    def check_media_storage():
        """Check S3 storage connectivity by attempting to write and delete a test file."""
        test_file_name = f"readiness-check-{uuid.uuid4()}.txt"
        test_content = ContentFile(b"readiness check")

        try:
            # Attempt to save the test file
            default_storage.save(test_file_name, test_content)
            # Attempt to delete the test file
            default_storage.delete(test_file_name)
        except (SuspiciousFileOperation, OSError, BotoCoreError, ClientError) as e:
            # Re-raise with context from the original exception
            raise RuntimeError("Media storage check failed.") from e

    try:
        # Run all checks
        check_database()
        check_cache()
        check_media_storage()

        # If all checks pass
        return JsonResponse({"status": "OK"}, status=200)

    except (
        RuntimeError,  # raised by the individual checks above
        OperationalError,
        CacheKeyWarning,
        BotoCoreError,
        ClientError,
        requests.RequestException,
    ) as e:
        return JsonResponse({"status": "Error", "message": str(e)}, status=500)

src/backend/core/api/decorators.py

Lines changed: 50 additions & 0 deletions
@@ -0,0 +1,50 @@ (new file)

"""Decorators for Impress' core application."""

import os
from ipaddress import ip_address, ip_network

from django.http import HttpResponseForbidden


def monitoring_cidr_protected_view(view):
    """
    Decorator to restrict access to a view based on CIDR ranges.

    Checks the client's IP address against allowed CIDR ranges specified
    in the MONITORING_ALLOWED_CIDR_RANGES environment variable. If the
    IP address is not within the allowed ranges, access is denied.
    """

    def wrapped_view(request, *args, **kwargs):
        cidr_env_raw = os.environ.get("MONITORING_ALLOWED_CIDR_RANGES", "")
        cidr_env_stripped = cidr_env_raw.strip().strip('"').strip("'")

        allow_all = cidr_env_stripped == "*"

        allowed_cidr_ranges = []
        if not allow_all:
            try:
                allowed_cidr_ranges = [
                    ip_network(c.strip().strip('"').strip("'"))
                    for c in cidr_env_stripped.split(",")
                    if c.strip()
                ]
            except ValueError as e:
                raise ValueError(
                    f"Invalid CIDR range in MONITORING_ALLOWED_CIDR_RANGES: {e}"
                ) from e

        client_ip = request.META.get("REMOTE_ADDR")

        if allow_all:
            return view(request, *args, **kwargs)

        if not allowed_cidr_ranges:
            return HttpResponseForbidden("No allowed CIDR ranges configured.")

        if not any(ip_address(client_ip) in cidr for cidr in allowed_cidr_ranges):
            return HttpResponseForbidden("Access denied: Your IP is not allowed.")

        return view(request, *args, **kwargs)

    return wrapped_view
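The decorator is designed to wrap plain function views such as the probe and metrics views above. A short usage sketch (which views it wraps in the real code is not shown in this excerpt, so the combination below is an assumption):

# Hypothetical usage of the CIDR filter; the wrapped view is an assumption.
from core.api.decorators import monitoring_cidr_protected_view
from core.api.probes import readiness_check

# Only clients whose REMOTE_ADDR falls inside MONITORING_ALLOWED_CIDR_RANGES
# (e.g. "10.0.0.0/8,192.168.0.0/16", or "*" to allow everyone) reach the view.
protected_readiness_check = monitoring_cidr_protected_view(readiness_check)

Because the check uses REMOTE_ADDR, a deployment behind a reverse proxy sees the proxy's address rather than the end client's, and the configured CIDR ranges must account for that.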
