✨(backend) add prometheus metrics, liveness and readiness probe endpoints #562

Open. Wants to merge 1 commit into main.
3 changes: 3 additions & 0 deletions .github/workflows/impress.yml
@@ -153,6 +153,9 @@ jobs:
AWS_S3_ENDPOINT_URL: http://localhost:9000
AWS_S3_ACCESS_KEY_ID: impress
AWS_S3_SECRET_ACCESS_KEY: password
MONITORING_PROMETHEUS_EXPORTER: true
MONITORING_PROBING: true
MONITORING_ALLOWED_CIDR_RANGES: "*"

steps:
- name: Checkout repository
4 changes: 4 additions & 0 deletions CHANGELOG.md
@@ -9,6 +9,10 @@ and this project adheres to

## [Unreleased]

## Added

- ✨(backend) add prometheus metrics and probe endpoints #455

## [2.0.1] - 2025-01-17

## Fixed
3 changes: 3 additions & 0 deletions docker-compose.yml
@@ -50,6 +50,9 @@ services:
environment:
- PYLINTHOME=/app/.pylint.d
- DJANGO_CONFIGURATION=Development
- MONITORING_PROMETHEUS_EXPORTER=true
- MONITORING_PROBING=true
- MONITORING_ALLOWED_CIDR_RANGES="*"
env_file:
- env.d/development/common
- env.d/development/postgresql
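The three MONITORING_* variables introduced above are plain environment toggles. A minimal sketch of how the boolean ones could be consumed in Django settings, assuming an `env_bool` helper that is not part of this PR:

```python
# Hypothetical settings excerpt (not from this diff): reading the
# MONITORING_* toggles. env_bool is an assumed helper for illustration.
import os


def env_bool(name, default="false"):
    """Interpret common truthy strings from the environment."""
    return os.environ.get(name, default).strip().lower() in ("1", "true", "yes")


MONITORING_PROMETHEUS_EXPORTER = env_bool("MONITORING_PROMETHEUS_EXPORTER")
MONITORING_PROBING = env_bool("MONITORING_PROBING")
```

Note that the docker-compose list syntax above passes the literal value `"*"` (quotes included) for MONITORING_ALLOWED_CIDR_RANGES, which is why the decorator added later in this diff strips surrounding quotes before parsing.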
198 changes: 198 additions & 0 deletions src/backend/core/api/custom_metrics_exporter.py
@@ -0,0 +1,198 @@
"""Custom Prometheus Metrics Exporter for Impress' core application."""

from datetime import timedelta

from django.conf import settings
from django.db.models import Count, F, Max, Min, Q
from django.utils.timezone import now

from prometheus_client.core import GaugeMetricFamily

from core import models


class CustomMetricsExporter:
"""
Custom Prometheus metrics collector for various application
relevant metrics.
"""

def collect(self):
"""
Collect and yield Prometheus metrics for user activity, document activity,
and document statistics over various time periods.
"""

namespace = getattr(settings, "PROMETHEUS_METRIC_NAMESPACE", "")

def prefixed_metric_name(name):
return f"{namespace}_{name}" if namespace else name

# Group time boundaries into a dictionary to reduce separate local variables
times = {}
times["today_start_utc"] = now().replace(
hour=0, minute=0, second=0, microsecond=0
)
times["one_week_ago"] = times["today_start_utc"] - timedelta(days=7)
times["one_month_ago"] = times["today_start_utc"] - timedelta(days=30)

# Group user queries/metrics into a dictionary
user_metrics = {
"total_users": models.User.objects.count(),
"active_users_today": models.User.objects.filter(
Q(documentaccess__updated_at__gte=times["today_start_utc"])
| Q(link_traces__created_at__gte=times["today_start_utc"])
| Q(last_login__gte=times["today_start_utc"])
)
.distinct()
.count(),
"active_users_7_days": models.User.objects.filter(
Q(documentaccess__updated_at__gte=times["one_week_ago"])
| Q(link_traces__created_at__gte=times["one_week_ago"])
| Q(last_login__gte=times["one_week_ago"])
)
.distinct()
.count(),
"active_users_30_days": models.User.objects.filter(
Q(documentaccess__updated_at__gte=times["one_month_ago"])
| Q(link_traces__created_at__gte=times["one_month_ago"])
| Q(last_login__gte=times["one_month_ago"])
)
.distinct()
.count(),
}

# Group document queries/metrics into a dictionary
doc_metrics = {
"total_documents": models.Document.objects.count(),
"shared_docs_count": (
models.Document.objects.annotate(access_count=Count("accesses"))
.filter(access_count__gt=1)
.count()
),
"active_docs_today": models.Document.objects.filter(
updated_at__gte=times["today_start_utc"],
updated_at__lt=times["today_start_utc"] + timedelta(days=1),
).count(),
"active_docs_last_7_days": models.Document.objects.filter(
updated_at__gte=times["one_week_ago"]
).count(),
"active_docs_last_30_days": models.Document.objects.filter(
updated_at__gte=times["one_month_ago"]
).count(),
}

# Use a single aggregation call for oldest/newest document creation date
doc_ages = models.Document.objects.aggregate(
oldest=Min("created_at"),
newest=Max("created_at"),
)

# Prepare user distribution data
user_doc_counts = models.DocumentAccess.objects.values("user_id").annotate(
doc_count=Count("document_id"), admin_email=F("user__admin_email")
)

# Collect all metrics in one list
metrics = []

# -- User metrics
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("total_users"),
"Total number of users",
value=user_metrics["total_users"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_users_today"),
"Number of active users today",
value=user_metrics["active_users_today"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_users_7_days"),
"Number of active users in the last 7 days",
value=user_metrics["active_users_7_days"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_users_30_days"),
"Number of active users in the last 30 days",
value=user_metrics["active_users_30_days"],
)
)

# -- Document metrics
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("total_documents"),
"Total number of documents",
value=doc_metrics["total_documents"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("shared_documents"),
"Number of shared documents",
value=doc_metrics["shared_docs_count"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_documents_today"),
"Number of active documents today",
value=doc_metrics["active_docs_today"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_documents_7_days"),
"Number of active documents in the last 7 days",
value=doc_metrics["active_docs_last_7_days"],
)
)
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("active_documents_30_days"),
"Number of active documents in the last 30 days",
value=doc_metrics["active_docs_last_30_days"],
)
)

# -- Document oldest/newest timestamps
if doc_ages["oldest"]:
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("oldest_document_date"),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's recommended to include "timestamp"in the metric name and have a _seconds suffix (from https://prometheus.io/docs/practices/naming/#metric-names)

Suggested change
prefixed_metric_name("oldest_document_date"),
prefixed_metric_name("oldest_document_timestamp_seconds"),

"Timestamp of the oldest document creation date",
value=doc_ages["oldest"].timestamp(),
)
)
if doc_ages["newest"]:
metrics.append(
GaugeMetricFamily(
prefixed_metric_name("newest_document_date"),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggested change
prefixed_metric_name("newest_document_date"),
prefixed_metric_name("newest_document_timestamp_seconds"),

"Timestamp of the newest document creation date",
value=doc_ages["newest"].timestamp(),
)
)

# -- User document distribution
user_distribution_metric = GaugeMetricFamily(
prefixed_metric_name("user_document_distribution"),

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(suggestion) not sure that the "_distribution" is meaningful?

Suggested change
prefixed_metric_name("user_document_distribution"),
prefixed_metric_name("user_documents"),

"Document counts per user",
labels=["user_email"],
)
for user in user_doc_counts:
if user["admin_email"]: # Validate email existence
user_distribution_metric.add_metric(
[user["admin_email"]], user["doc_count"]
)
metrics.append(user_distribution_metric)

# Yield from metrics
yield from metrics
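A collector like this only takes effect once it is registered with prometheus_client. A minimal sketch of that registration, assuming an AppConfig hook; the actual wiring is not shown in this excerpt:

```python
# Hypothetical AppConfig (not from this diff): registering the collector
# with the default registry so collect() runs on every /metrics scrape.
from django.apps import AppConfig
from prometheus_client.core import REGISTRY


class CoreConfig(AppConfig):
    name = "core"

    def ready(self):
        # Import here so models are only touched once the app registry is ready.
        from core.api.custom_metrics_exporter import CustomMetricsExporter

        REGISTRY.register(CustomMetricsExporter())
```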
85 changes: 85 additions & 0 deletions src/backend/core/api/custom_probe_views.py
@@ -0,0 +1,85 @@
"""API liveness and readiness probes for Impress' core application."""

import uuid

from django.core.cache import CacheKeyWarning, cache
from django.core.exceptions import SuspiciousFileOperation
from django.core.files.base import ContentFile
from django.core.files.storage import default_storage
from django.db import connections
from django.db.utils import OperationalError
from django.http import JsonResponse

import requests
from botocore.exceptions import BotoCoreError, ClientError


def liveness_check(request):
"""
Liveness probe endpoint.
Returns HTTP 200 if the application is alive and running.
"""

return JsonResponse({"status": "OK"}, status=200)


def readiness_check(request):
"""
Readiness probe endpoint.
Checks database, cache, media storage, and OIDC configuration.
Returns HTTP 200 with JSON status "OK" if all checks pass,
or HTTP 500 with JSON status "Error" and an error message.
"""

def check_database():
"""Check database connectivity."""
try:
db_conn = connections["default"]
db_conn.cursor()
except OperationalError as e:
raise RuntimeError(
"Database connectivity check failed."
"Please verify your database configuration and status."
) from e

def check_cache():
"""Check cache connectivity."""
test_key = "readiness-probe"
test_value = "ready"
cache.set(test_key, test_value, timeout=5)
if cache.get(test_key) != test_value:
raise RuntimeError(
"Cache check failed: Value mismatch or cache unavailable."
)

def check_media_storage():
"""Check S3 storage connectivity by attempting to write and delete a test file."""
test_file_name = f"readiness-check-{uuid.uuid4()}.txt"
test_content = ContentFile(b"readiness check")

try:
# Attempt to save the test file
default_storage.save(test_file_name, test_content)
# Attempt to delete the test file
default_storage.delete(test_file_name)
except (SuspiciousFileOperation, OSError, BotoCoreError, ClientError) as e:
# Re-raise with context from the original exception
raise RuntimeError("Media storage check failed.") from e

try:
# Run all checks
check_database()
check_cache()
check_media_storage()

# If all checks pass
return JsonResponse({"status": "OK"}, status=200)

except (
OperationalError,
CacheKeyWarning,
BotoCoreError,
ClientError,
requests.RequestException,
) as e:
return JsonResponse({"status": "Error", "message": str(e)}, status=500)
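A quick way to exercise both probes against a running instance; the URL paths and port are assumptions, since the route definitions are not part of this excerpt:

```python
# Hypothetical smoke test for the probe endpoints; paths and port are assumed.
import requests

BASE = "http://localhost:8000"

for path in ("/probes/liveness/", "/probes/readiness/"):
    resp = requests.get(f"{BASE}{path}", timeout=5)
    # A healthy instance returns {"status": "OK"} with HTTP 200; a failing
    # readiness check returns {"status": "Error", "message": "..."} with 500.
    print(path, resp.status_code, resp.json())
```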
50 changes: 50 additions & 0 deletions src/backend/core/api/decorators.py
@@ -0,0 +1,50 @@
"""Decorators for Impress' core application."""

import os
from ipaddress import ip_address, ip_network

from django.http import HttpResponseForbidden


def monitoring_cidr_protected_view(view):
"""
Decorator to restrict access to a view based on CIDR ranges.

Checks the client's IP address against allowed CIDR ranges specified
in the MONITORING_ALLOWED_CIDR_RANGES environment variable. If the
IP address is not within the allowed ranges, access is denied.
"""

def wrapped_view(request, *args, **kwargs):
cidr_env_raw = os.environ.get("MONITORING_ALLOWED_CIDR_RANGES", "")
cidr_env_stripped = cidr_env_raw.strip().strip('"').strip("'")

allow_all = cidr_env_stripped == "*"

allowed_cidr_ranges = []
if not allow_all:
try:
allowed_cidr_ranges = [
ip_network(c.strip().strip('"').strip("'"))
for c in cidr_env_stripped.split(",")
if c.strip()
]
except ValueError as e:
raise ValueError(
f"Invalid CIDR range in MONITORING_ALLOWED_CIDR_RANGES: {e}"
) from e

client_ip = request.META.get("REMOTE_ADDR")

if allow_all:
return view(request, *args, **kwargs)

if not allowed_cidr_ranges:
return HttpResponseForbidden("No allowed CIDR ranges configured.")

if not any(ip_address(client_ip) in cidr for cidr in allowed_cidr_ranges):
return HttpResponseForbidden("Access denied: Your IP is not allowed.")

return view(request, *args, **kwargs)

return wrapped_view
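The decorator's CIDR matching is plain stdlib `ipaddress`; a small self-contained illustration of the membership test it performs:

```python
# Self-contained illustration of the ip_network / ip_address calls used above.
from ipaddress import ip_address, ip_network

allowed = [ip_network("10.0.0.0/8"), ip_network("192.168.1.0/24")]

for client_ip in ("10.1.2.3", "172.16.0.1"):
    permitted = any(ip_address(client_ip) in cidr for cidr in allowed)
    print(client_ip, "allowed" if permitted else "forbidden")
# -> 10.1.2.3 allowed
# -> 172.16.0.1 forbidden
```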