diff --git a/.github/workflows/impress.yml b/.github/workflows/impress.yml index 896f07efb..123ac51c9 100644 --- a/.github/workflows/impress.yml +++ b/.github/workflows/impress.yml @@ -153,6 +153,9 @@ jobs: AWS_S3_ENDPOINT_URL: http://localhost:9000 AWS_S3_ACCESS_KEY_ID: impress AWS_S3_SECRET_ACCESS_KEY: password + MONITORING_PROMETHEUS_EXPORTER: true + MONITORING_PROBING: true + MONITORING_ALLOWED_CIDR_RANGES: "*" steps: - name: Checkout repository diff --git a/CHANGELOG.md b/CHANGELOG.md index 99ecaa94b..a6b41705c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,10 @@ and this project adheres to ## [Unreleased] +## Added + +- ✨(backend) add prometheus metrics and probe endpoints #455 + ## [2.0.1] - 2025-01-17 ## Fixed diff --git a/docker-compose.yml b/docker-compose.yml index 62d2d914b..bec006e01 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -50,6 +50,9 @@ services: environment: - PYLINTHOME=/app/.pylint.d - DJANGO_CONFIGURATION=Development + - MONITORING_PROMETHEUS_EXPORTER=true + - MONITORING_PROBING=true + - MONITORING_ALLOWED_CIDR_RANGES="*" env_file: - env.d/development/common - env.d/development/postgresql diff --git a/src/backend/core/api/custom_metrics_exporter.py b/src/backend/core/api/custom_metrics_exporter.py new file mode 100644 index 000000000..e7ab65911 --- /dev/null +++ b/src/backend/core/api/custom_metrics_exporter.py @@ -0,0 +1,198 @@ +"""Custom Prometheus Metrics Exporter for Impress' core application.""" + +from datetime import timedelta + +from django.conf import settings +from django.db.models import Count, F, Max, Min, Q +from django.utils.timezone import now + +from prometheus_client.core import GaugeMetricFamily + +from core import models + + +class CustomMetricsExporter: + """ + Custom Prometheus metrics collector for various application + relevant metrics. 
+ """ + + def collect(self): + """ + Collect and yield Prometheus metrics for user activity, document activity, + and document statistics over various time periods. + """ + + namespace = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "") + + def prefixed_metric_name(name): + return f"{namespace}_{name}" if namespace else name + + # Group time boundaries into a dictionary to reduce separate local variables + times = {} + times["today_start_utc"] = now().replace( + hour=0, minute=0, second=0, microsecond=0 + ) + times["one_week_ago"] = times["today_start_utc"] - timedelta(days=7) + times["one_month_ago"] = times["today_start_utc"] - timedelta(days=30) + + # Group user queries/metrics into a dictionary + user_metrics = { + "total_users": models.User.objects.count(), + "active_users_today": models.User.objects.filter( + Q(documentaccess__updated_at__gte=times["today_start_utc"]) + | Q(link_traces__created_at__gte=times["today_start_utc"]) + | Q(last_login__gte=times["today_start_utc"]) + ) + .distinct() + .count(), + "active_users_7_days": models.User.objects.filter( + Q(documentaccess__updated_at__gte=times["one_week_ago"]) + | Q(link_traces__created_at__gte=times["one_week_ago"]) + | Q(last_login__gte=times["one_week_ago"]) + ) + .distinct() + .count(), + "active_users_30_days": models.User.objects.filter( + Q(documentaccess__updated_at__gte=times["one_month_ago"]) + | Q(link_traces__created_at__gte=times["one_month_ago"]) + | Q(last_login__gte=times["one_month_ago"]) + ) + .distinct() + .count(), + } + + # Group document queries/metrics into a dictionary + doc_metrics = { + "total_documents": models.Document.objects.count(), + "shared_docs_count": ( + models.Document.objects.annotate(access_count=Count("accesses")) + .filter(access_count__gt=1) + .count() + ), + "active_docs_today": models.Document.objects.filter( + updated_at__gte=times["today_start_utc"], + updated_at__lt=times["today_start_utc"] + timedelta(days=1), + ).count(), + "active_docs_last_7_days": 
models.Document.objects.filter( + updated_at__gte=times["one_week_ago"] + ).count(), + "active_docs_last_30_days": models.Document.objects.filter( + updated_at__gte=times["one_month_ago"] + ).count(), + } + + # Use a single aggregation call for oldest/newest document creation date + doc_ages = models.Document.objects.aggregate( + oldest=Min("created_at"), + newest=Max("created_at"), + ) + + # Prepare user distribution data + user_doc_counts = models.DocumentAccess.objects.values("user_id").annotate( + doc_count=Count("document_id"), admin_email=F("user__admin_email") + ) + + # Collect all metrics in one list + metrics = [] + + # -- User metrics + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("total_users"), + "Total number of users", + value=user_metrics["total_users"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_users_today"), + "Number of active users today", + value=user_metrics["active_users_today"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_users_7_days"), + "Number of active users in the last 7 days", + value=user_metrics["active_users_7_days"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_users_30_days"), + "Number of active users in the last 30 days", + value=user_metrics["active_users_30_days"], + ) + ) + + # -- Document metrics + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("total_documents"), + "Total number of documents", + value=doc_metrics["total_documents"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("shared_documents"), + "Number of shared documents", + value=doc_metrics["shared_docs_count"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_documents_today"), + "Number of active documents today", + value=doc_metrics["active_docs_today"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_documents_7_days"), + "Number of active documents 
in the last 7 days", + value=doc_metrics["active_docs_last_7_days"], + ) + ) + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("active_documents_30_days"), + "Number of active documents in the last 30 days", + value=doc_metrics["active_docs_last_30_days"], + ) + ) + + # -- Document oldest/newest timestamps + if doc_ages["oldest"]: + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("oldest_document_date"), + "Timestamp of the oldest document creation date", + value=doc_ages["oldest"].timestamp(), + ) + ) + if doc_ages["newest"]: + metrics.append( + GaugeMetricFamily( + prefixed_metric_name("newest_document_date"), + "Timestamp of the newest document creation date", + value=doc_ages["newest"].timestamp(), + ) + ) + + # -- User document distribution + user_distribution_metric = GaugeMetricFamily( + prefixed_metric_name("user_document_distribution"), + "Document counts per user", + labels=["user_email"], + ) + for user in user_doc_counts: + if user["admin_email"]: # Validate email existence + user_distribution_metric.add_metric( + [user["admin_email"]], user["doc_count"] + ) + metrics.append(user_distribution_metric) + + # Yield from metrics + yield from metrics diff --git a/src/backend/core/api/custom_probe_views.py b/src/backend/core/api/custom_probe_views.py new file mode 100644 index 000000000..927066a50 --- /dev/null +++ b/src/backend/core/api/custom_probe_views.py @@ -0,0 +1,85 @@ +"""API liveness and readiness probes for Impress' core application.""" + +import uuid + +from django.core.cache import CacheKeyWarning, cache +from django.core.exceptions import SuspiciousFileOperation +from django.core.files.base import ContentFile +from django.core.files.storage import default_storage +from django.db import connections +from django.db.utils import OperationalError +from django.http import JsonResponse + +import requests +from botocore.exceptions import BotoCoreError, ClientError + + +def liveness_check(request): + """ + Liveness probe endpoint. 
+ Returns HTTP 200 if the application is alive and running. + """ + + return JsonResponse({"status": "OK"}, status=200) + + +def readiness_check(request): + """ + Readiness probe endpoint. + Checks database, cache, media storage, and OIDC configuration. + Returns HTTP 200 with JSON status "OK" if all checks pass, + or HTTP 500 with JSON status "Error" and an error message. + """ + + def check_database(): + """Check database connectivity.""" + try: + db_conn = connections["default"] + db_conn.cursor() + except OperationalError as e: + raise RuntimeError( + "Database connectivity check failed. " + "Please verify your database configuration and status." + ) from e + + def check_cache(): + """Check cache connectivity.""" + test_key = "readiness-probe" + test_value = "ready" + cache.set(test_key, test_value, timeout=5) + if cache.get(test_key) != test_value: + raise RuntimeError( + "Cache check failed: Value mismatch or cache unavailable." + ) + + def check_media_storage(): + """Check S3 storage connectivity by attempting to write and delete a test file.""" + test_file_name = f"readiness-check-{uuid.uuid4()}.txt" + test_content = ContentFile(b"readiness check") + + try: + # Attempt to save the test file + default_storage.save(test_file_name, test_content) + # Attempt to delete the test file + default_storage.delete(test_file_name) + except (SuspiciousFileOperation, OSError, BotoCoreError, ClientError) as e: + # Re-raise with context from the original exception + raise RuntimeError("Media storage check failed.") from e + + try: + # Run all checks + check_database() + check_cache() + check_media_storage() + + # If all checks pass + return JsonResponse({"status": "OK"}, status=200) + + except ( + RuntimeError, + OperationalError, + CacheKeyWarning, + BotoCoreError, + ClientError, + ) as e: + return JsonResponse({"status": "Error", "message": str(e)}, status=500) diff --git a/src/backend/core/api/decorators.py b/src/backend/core/api/decorators.py new file mode 
100644 index 000000000..4e63f55df --- /dev/null +++ b/src/backend/core/api/decorators.py @@ -0,0 +1,50 @@ +"""Decorators for Impress' core application.""" + +import os +from ipaddress import ip_address, ip_network + +from django.http import HttpResponseForbidden + + +def monitoring_cidr_protected_view(view): + """ + Decorator to restrict access to a view based on CIDR ranges. + + Checks the client's IP address against allowed CIDR ranges specified + in the MONITORING_ALLOWED_CIDR_RANGES environment variable. If the + IP address is not within the allowed ranges, access is denied. + """ + + def wrapped_view(request, *args, **kwargs): + cidr_env_raw = os.environ.get("MONITORING_ALLOWED_CIDR_RANGES", "") + cidr_env_stripped = cidr_env_raw.strip().strip('"').strip("'") + + allow_all = cidr_env_stripped == "*" + + allowed_cidr_ranges = [] + if not allow_all: + try: + allowed_cidr_ranges = [ + ip_network(c.strip().strip('"').strip("'")) + for c in cidr_env_stripped.split(",") + if c.strip() + ] + except ValueError as e: + raise ValueError( + f"Invalid CIDR range in MONITORING_ALLOWED_CIDR_RANGES: {e}" + ) from e + + client_ip = request.META.get("REMOTE_ADDR") + + if allow_all: + return view(request, *args, **kwargs) + + if not allowed_cidr_ranges: + return HttpResponseForbidden("No allowed CIDR ranges configured.") + + if not any(ip_address(client_ip) in cidr for cidr in allowed_cidr_ranges): + return HttpResponseForbidden("Access denied: Your IP is not allowed.") + + return view(request, *args, **kwargs) + + return wrapped_view diff --git a/src/backend/core/tests/monitoring/test_probes_cidr.py b/src/backend/core/tests/monitoring/test_probes_cidr.py new file mode 100644 index 000000000..b134485af --- /dev/null +++ b/src/backend/core/tests/monitoring/test_probes_cidr.py @@ -0,0 +1,179 @@ +"""Test liveness and readiness CIDR protected probes of impress's core app.""" + +import os +from unittest.mock import patch + +from django.test import TestCase + + +class 
ProbesCidrProtectionTest(TestCase): + def test_probes_cidr_protection(self): + """ + Tests that hitting the /probes/liveness/ or /probes/readiness/ endpoints + returns 403 for disallowed IPs, 200 for allowed IPs (CIDR logic), + and 403 if MONITORING_PROBING is not set or "False" (because no CIDR is configured). + """ + + scenarios = [ + # 1) No CIDR => 403 on liveness (MONITORING_PROBING="True") + { + "name": "No CIDR => 403 + 'No allowed CIDR ranges configured.' => liveness", + "env": { + "MONITORING_PROBING": "True", + # We do NOT define MONITORING_ALLOWED_CIDR_RANGES at all + }, + "endpoint": "/probes/liveness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 2) No CIDR => 403 on readiness (MONITORING_PROBING="True") + { + "name": "No CIDR => 403 + 'No allowed CIDR ranges configured.' => readiness", + "env": { + "MONITORING_PROBING": "True", + }, + "endpoint": "/probes/readiness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 3) CIDR=172.19.0.0/16 => IP outside => 403 on liveness + { + "name": "CIDR='172.19.0.0/16' => IP outside => 403 => liveness", + "env": { + "MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "endpoint": "/probes/liveness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "Access denied: Your IP is not allowed.", + }, + # 4) CIDR=172.19.0.0/16 => IP outside => 403 on readiness + { + "name": "CIDR='172.19.0.0/16' => IP outside => 403 => readiness", + "env": { + "MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "endpoint": "/probes/readiness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "Access denied: Your IP is not allowed.", + }, + # 5) CIDR='*' => any IP => 200 => liveness + { + "name": "CIDR='*' => any IP => 200 => liveness", + "env": { + 
"MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "*", + }, + "endpoint": "/probes/liveness/", + "remote_addr": "8.8.8.8", + "expected_status": 200, + "expected_text": "OK", + }, + # 6) CIDR='*' => any IP => 200 => readiness + { + "name": "CIDR='*' => any IP => 200 => readiness", + "env": { + "MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "*", + }, + "endpoint": "/probes/readiness/", + "remote_addr": "8.8.8.8", + "expected_status": 200, + "expected_text": "OK", + }, + # 7) CIDR=172.19.0.0/16 => IP inside => 200 => liveness + { + "name": "CIDR='172.19.0.0/16' => IP inside => 200 => liveness", + "env": { + "MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "endpoint": "/probes/liveness/", + "remote_addr": "172.19.0.2", + "expected_status": 200, + "expected_text": "OK", + }, + # 8) CIDR=172.19.0.0/16 => IP inside => 200 => readiness + { + "name": "CIDR='172.19.0.0/16' => IP inside => 200 => readiness", + "env": { + "MONITORING_PROBING": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "endpoint": "/probes/readiness/", + "remote_addr": "172.19.0.2", + "expected_status": 200, + "expected_text": "OK", + }, + # 9) MONITORING_PROBING not set => no CIDR => 403 + { + "name": "MONITORING_PROBING not set => liveness => 403 => 'No allowed CIDR ranges configured.'", + "env": { + # No MONITORING_PROBING, no CIDR + }, + "endpoint": "/probes/liveness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 10) MONITORING_PROBING not set => readiness => 403 + { + "name": "MONITORING_PROBING not set => readiness => 403 => 'No allowed CIDR ranges configured.'", + "env": { + # No MONITORING_PROBING, no CIDR + }, + "endpoint": "/probes/readiness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 11) MONITORING_PROBING='False' => no CIDR => 403 + { + 
"name": 'MONITORING_PROBING="False" => liveness => 403 => "No allowed CIDR..."', + "env": { + "MONITORING_PROBING": "False", + }, + "endpoint": "/probes/liveness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 12) MONITORING_PROBING='False' => readiness => 403 + { + "name": 'MONITORING_PROBING="False" => readiness => 403 => "No allowed CIDR..."', + "env": { + "MONITORING_PROBING": "False", + }, + "endpoint": "/probes/readiness/", + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + ] + + for scenario in scenarios: + with self.subTest(msg=scenario["name"]): + with patch.dict(os.environ, scenario["env"], clear=True): + response = self.client.get( + scenario["endpoint"], REMOTE_ADDR=scenario["remote_addr"] + ) + + self.assertEqual( + scenario["expected_status"], + response.status_code, + f"Failed scenario: {scenario['name']}", + ) + + content = response.content.decode("utf-8") + + self.assertIn( + scenario["expected_text"], + content, + f"Failed scenario: {scenario['name']}\n" + f"Response content:\n{content}", + ) diff --git a/src/backend/core/tests/monitoring/test_prometheus_cidr.py b/src/backend/core/tests/monitoring/test_prometheus_cidr.py new file mode 100644 index 000000000..21cd2a61a --- /dev/null +++ b/src/backend/core/tests/monitoring/test_prometheus_cidr.py @@ -0,0 +1,108 @@ +"""Test prometheus metrics CIDR protection of impress's core app.""" + +import os +from unittest.mock import patch + +from django.test import TestCase + + +class PrometheusCidrProtectionTest(TestCase): + def test_prometheus_cidr_protection(self): + """ + Adopts the same 12-scenario style from the ProbesCidrProtectionTest, + but applies it to /prometheus/ using MONITORING_PROMETHEUS_EXPORTER and + MONITORING_ALLOWED_CIDR_RANGES. 
+ + We'll check for either 403 with specific messages, + or 200 with a known metric in the response (e.g., 'process_virtual_memory_bytes'). + """ + + scenarios = [ + # 1) MONITORING_PROMETHEUS_EXPORTER=True => No CIDR => 403 + { + "name": "No CIDR => 403 + 'No allowed CIDR ranges configured.'", + "env": { + "MONITORING_PROMETHEUS_EXPORTER": "True", + # We do NOT define MONITORING_ALLOWED_CIDR_RANGES + }, + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 2) MONITORING_PROMETHEUS_EXPORTER=True => CIDR=172.19.0.0/16 => IP outside => 403 + { + "name": "CIDR='172.19.0.0/16' => IP outside => 403 => 'Access denied'", + "env": { + "MONITORING_PROMETHEUS_EXPORTER": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "Access denied: Your IP is not allowed.", + }, + # 3) MONITORING_PROMETHEUS_EXPORTER=True => CIDR='*' => any IP => 200 => known metric + { + "name": "CIDR='*' => any IP => 200 => 'process_virtual_memory_bytes'", + "env": { + "MONITORING_PROMETHEUS_EXPORTER": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "*", + }, + "remote_addr": "8.8.8.8", + "expected_status": 200, + # We check for a known metric in the 200 response: + "expected_text": "process_virtual_memory_bytes", + }, + # 4) MONITORING_PROMETHEUS_EXPORTER=True => CIDR=172.19.0.0/16 => IP inside => 200 + { + "name": "CIDR='172.19.0.0/16' => IP inside => 200 => known metric", + "env": { + "MONITORING_PROMETHEUS_EXPORTER": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "172.19.0.0/16", + }, + "remote_addr": "172.19.0.2", + "expected_status": 200, + "expected_text": "process_virtual_memory_bytes", + }, + # 5) MONITORING_PROMETHEUS_EXPORTER not set => no CIDR => 403 + { + "name": "MONITORING_PROMETHEUS_EXPORTER not set => no CIDR => 403 => 'No allowed CIDR ranges configured.'", + "env": { + # No MONITORING_PROMETHEUS_EXPORTER + }, + "remote_addr": 
"127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + # 6) MONITORING_PROMETHEUS_EXPORTER='False' => no CIDR => 403 + { + "name": "MONITORING_PROMETHEUS_EXPORTER='False' => no CIDR => 403 => 'No allowed CIDR ranges configured.'", + "env": { + "MONITORING_PROMETHEUS_EXPORTER": "False", + }, + "remote_addr": "127.0.0.1", + "expected_status": 403, + "expected_text": "No allowed CIDR ranges configured.", + }, + ] + + for scenario in scenarios: + with self.subTest(msg=scenario["name"]): + with patch.dict(os.environ, scenario["env"], clear=True): + response = self.client.get( + "/prometheus/", REMOTE_ADDR=scenario["remote_addr"] + ) + + self.assertEqual( + scenario["expected_status"], + response.status_code, + f"Failed scenario: {scenario['name']}", + ) + + content = response.content.decode("utf-8") + + self.assertIn( + scenario["expected_text"], + content, + f"Failed scenario: {scenario['name']}\n" + f"Response content:\n{content}", + ) diff --git a/src/backend/core/tests/monitoring/test_prometheus_metrics.py b/src/backend/core/tests/monitoring/test_prometheus_metrics.py new file mode 100644 index 000000000..c409ef5a3 --- /dev/null +++ b/src/backend/core/tests/monitoring/test_prometheus_metrics.py @@ -0,0 +1,131 @@ +"""Test prometheus metrics of impress's core app.""" + +import os +from unittest.mock import patch + +from django.conf import settings +from django.test import TestCase +from django.test.utils import override_settings + +from prometheus_client import REGISTRY + +from core import factories +from core.api.custom_metrics_exporter import CustomMetricsExporter + + +def namespaced(metric): + """ + Pulls PROMETHEUS_METRIC_NAMESPACE (if any) + from Django settings to the given metric name. + + e.g. if PROMETHEUS_METRIC_NAMESPACE='impress' and metric='total_users', + returns 'impress_total_users'. 
+ """ + ns = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "") + return f"{ns}_{metric}" if ns else metric + + +class PrometheusMetricsTest(TestCase): + """ + Tests hitting the /prometheus/ endpoint. + We forcibly register the CustomMetricsExporter (normally done in wsgi.py) + so its metrics will appear even though wsgi.py is not invoked in tests. + """ + + @classmethod + def setUpClass(cls): + super().setUpClass() + # Ensure CustomMetricsExporter is registered + if not any( + isinstance(collector, CustomMetricsExporter) + for collector in REGISTRY._collector_to_names + ): + REGISTRY.register(CustomMetricsExporter()) + + def setUp(self): + """ + Create a user + document so user/doc metrics (e.g. total_users, + total_documents) are definitely non-zero and appear in the output list. + """ + self.user = factories.UserFactory() + self.doc = factories.DocumentFactory() + self.doc.accesses.create(role="owner", user=self.user) + + @override_settings(PROMETHEUS_METRIC_NAMESPACE="") + def test_prometheus_metrics_no_namespace(self): + """ + Scenario 1: No metric namespace => we expect the custom metrics + to appear with their raw names (e.g. 'total_users'). + """ + env = { + "MONITORING_PROMETHEUS_EXPORTER": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "*", + } + with patch.dict(os.environ, env, clear=True): + response = self.client.get("/prometheus/", REMOTE_ADDR="127.0.0.1") + self.assertEqual( + 200, response.status_code, "Expected 200 OK but got a different status." 
+ ) + content = response.content.decode("utf-8") + + # Check for a couple default 'process_' metrics + for metric in [ + "process_virtual_memory_bytes", + "process_resident_memory_bytes", + ]: + self.assertIn( + metric, content, f"Missing default process metric {metric}" + ) + + # Check for selected custom metrics (no prefix) + for metric in [ + "total_users", + "total_documents", + "active_users_today", + ]: + self.assertIn( + metric, + content, + f"Expected custom metric {metric} not found.\n{content}", + ) + + @override_settings(PROMETHEUS_METRIC_NAMESPACE="impress") + def test_prometheus_metrics_with_namespace(self): + """ + Scenario 2: We set PROMETHEUS_METRIC_NAMESPACE='impress' in settings, + so all custom metrics should appear prefixed with 'impress_'. + """ + env = { + "MONITORING_PROMETHEUS_EXPORTER": "True", + "MONITORING_ALLOWED_CIDR_RANGES": "*", + } + with patch.dict(os.environ, env, clear=True): + response = self.client.get("/prometheus/", REMOTE_ADDR="127.0.0.1") + self.assertEqual( + 200, response.status_code, "Expected 200 OK but got a different status." + ) + content = response.content.decode("utf-8") + + # Check for default metrics + for metric in [ + "process_virtual_memory_bytes", + "process_resident_memory_bytes", + ]: + self.assertIn( + metric, content, f"Missing default process metric {metric}" + ) + + # Check custom metrics that should be prefixed with + # the define value from settings.py ... + # We'll build the expected string via `namespaced()`. 
+ for base_metric in [ + "total_users", + "total_documents", + "active_users_today", + ]: + expected_metric = namespaced(base_metric) + self.assertIn( + expected_metric, + content, + f"Expected custom metric {expected_metric} not found.\n{content}", + ) diff --git a/src/backend/impress/settings.py b/src/backend/impress/settings.py index 495ec4bcf..270bf54de 100755 --- a/src/backend/impress/settings.py +++ b/src/backend/impress/settings.py @@ -23,6 +23,9 @@ # Build paths inside the project like this: BASE_DIR / 'subdir'. BASE_DIR = os.path.dirname(os.path.dirname(os.path.abspath(__file__))) DATA_DIR = os.path.join("/", "data") +MONITORING_PROMETHEUS_EXPORTER = ( + os.getenv("MONITORING_PROMETHEUS_EXPORTER", "False").lower() == "true" +) def get_release(): @@ -283,6 +286,26 @@ class Base(Configuration): "dockerflow.django.middleware.DockerflowMiddleware", ] + if MONITORING_PROMETHEUS_EXPORTER: + MIDDLEWARE.insert(0, "django_prometheus.middleware.PrometheusBeforeMiddleware") + MIDDLEWARE.append("django_prometheus.middleware.PrometheusAfterMiddleware") + PROMETHEUS_METRIC_NAMESPACE = "impress" + PROMETHEUS_LATENCY_BUCKETS = ( + 0.05, + 0.1, + 0.25, + 0.5, + 0.75, + 1.0, + 1.5, + 2.5, + 5.0, + 10.0, + 15.0, + 30.0, + float("inf"), + ) + AUTHENTICATION_BACKENDS = [ "django.contrib.auth.backends.ModelBackend", "core.authentication.backends.OIDCAuthenticationBackend", @@ -296,6 +319,7 @@ class Base(Configuration): "drf_spectacular", # Third party apps "corsheaders", + "django_prometheus", "dockerflow.django", "rest_framework", "parler", diff --git a/src/backend/impress/urls.py b/src/backend/impress/urls.py index 5dc490ac1..4011f8fd6 100644 --- a/src/backend/impress/urls.py +++ b/src/backend/impress/urls.py @@ -1,22 +1,55 @@ """URL configuration for the impress project""" +import os + from django.conf import settings from django.conf.urls.static import static from django.contrib import admin from django.contrib.staticfiles.urls import staticfiles_urlpatterns from 
django.urls import include, path, re_path +from django_prometheus import exports from drf_spectacular.views import ( SpectacularJSONAPIView, SpectacularRedocView, SpectacularSwaggerView, ) +from core.api.custom_probe_views import liveness_check, readiness_check +from core.api.decorators import monitoring_cidr_protected_view + urlpatterns = [ path("admin/", admin.site.urls), path("", include("core.urls")), ] +# Conditionally add Prometheus Exporter endpoint +if os.environ.get("MONITORING_PROMETHEUS_EXPORTER", "False").lower() == "true": + urlpatterns.append( + path( + "prometheus/", + monitoring_cidr_protected_view(exports.ExportToDjangoView), + name="prometheus-django-metrics", + ), + ) + +# Conditionally add liveness and readiness probe endpoints +if os.environ.get("MONITORING_PROBING", "False").lower() == "true": + urlpatterns.append( + path( + "probes/liveness/", + monitoring_cidr_protected_view(liveness_check), + name="liveness-probe", + ), + ) + urlpatterns.append( + path( + "probes/readiness/", + monitoring_cidr_protected_view(readiness_check), + name="readiness-probe", + ), + ) + if settings.DEBUG: urlpatterns = ( urlpatterns diff --git a/src/backend/impress/wsgi.py b/src/backend/impress/wsgi.py index 6076021c6..4988c0d33 100644 --- a/src/backend/impress/wsgi.py +++ b/src/backend/impress/wsgi.py @@ -11,7 +11,26 @@ from configurations.wsgi import get_wsgi_application +# Prometheus Metrics Registration +from prometheus_client import REGISTRY + +from core.api.custom_metrics_exporter import CustomMetricsExporter + + +def register_prometheus_exporter(): + """ + Register custom Prometheus metrics collector. 
+ """ + if not any( + isinstance(collector, CustomMetricsExporter) for collector in REGISTRY.collect() + ): + REGISTRY.register(CustomMetricsExporter()) + + os.environ.setdefault("DJANGO_SETTINGS_MODULE", "impress.settings") os.environ.setdefault("DJANGO_CONFIGURATION", "Development") +if os.environ.get("MONITORING_PROMETHEUS_EXPORTER", "False").lower() == "true": + register_prometheus_exporter() + application = get_wsgi_application() diff --git a/src/backend/pyproject.toml b/src/backend/pyproject.toml index 25b5df954..c236808ae 100644 --- a/src/backend/pyproject.toml +++ b/src/backend/pyproject.toml @@ -33,6 +33,7 @@ dependencies = [ "django-countries==7.6.1", "django-filter==24.3", "django-parler==2.3", + "django-prometheus==2.3.1", "redis==5.2.1", "django-redis==5.4.0", "django-storages[s3]==1.14.4",