Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Prometheus http and kernel startup/shutdown metrics #1377

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
18 changes: 17 additions & 1 deletion enterprise_gateway/base/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from typing import List

import jupyter_server._version
import prometheus_client
from jupyter_server.base.handlers import APIHandler
from tornado import web

Expand All @@ -31,6 +32,17 @@ def get(self):
)


class PrometheusMetricsHandler(CORSMixin, web.RequestHandler):
"""
Return prometheus metrics from this enterprise gateway
"""

def get(self):
"""Get the latest state of the Prometheus' metrics."""
self.set_header("Content-Type", prometheus_client.CONTENT_TYPE_LATEST)
self.write(prometheus_client.generate_latest(prometheus_client.REGISTRY))


class NotFoundHandler(JSONErrorsMixin, web.RequestHandler):
"""
Catches all requests and responds with 404 JSON messages.
Expand All @@ -48,4 +60,8 @@ def prepare(self):
raise web.HTTPError(404)


default_handlers: List[tuple] = [(r"/api", APIVersionHandler), (r"/(.*)", NotFoundHandler)]
default_handlers: List[tuple] = [
(r"/api", APIVersionHandler),
(r"/metrics", PrometheusMetricsHandler),
(r"/(.*)", NotFoundHandler),
]
3 changes: 2 additions & 1 deletion enterprise_gateway/enterprisegatewayapp.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@
WebhookKernelSessionManager,
)
from .services.sessions.sessionmanager import SessionManager
from .webapp import EnterpriseGatewayWebApp

try:
from jupyter_server.auth.authorizer import AllowAllAuthorizer
Expand Down Expand Up @@ -219,7 +220,7 @@ def init_webapp(self) -> None:

handlers = self._create_request_handlers()

self.web_app = web.Application(
self.web_app = EnterpriseGatewayWebApp(
handlers=handlers,
kernel_manager=self.kernel_manager,
session_manager=self.session_manager,
Expand Down
30 changes: 30 additions & 0 deletions enterprise_gateway/metrics.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
"""Collection of all the metrics used by the Enterprise Gateway"""

import os

from prometheus_client import Histogram

metrics_prefix = os.environ.get("EG_METRICS_PREFIX", "enterprise_gateway")

HTTP_REQUEST_DURATION_SECONDS = Histogram(
'http_request_duration_seconds',
'Request duration for all HTTP requests',
['method', 'handler', 'status_code'],
namespace=metrics_prefix,
)

KERNEL_START_DURATION_SECONDS = Histogram(
'kernel_start_duration_seconds',
'Kernel startup duration',
['kernel_name', 'process_proxy'],
buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
namespace=metrics_prefix,
)

KERNEL_SHUTDOWN_DURATION_SECONDS = Histogram(
'kernel_shutdown_duration_seconds',
'Kernel startup duration for all HTTP requests',
['kernel_name', 'process_proxy'],
buckets=[0.1, 0.25, 0.5, 1, 2.5, 5.0, 10.0, 15.0, 20.0, 30.0],
namespace=metrics_prefix,
)
33 changes: 32 additions & 1 deletion enterprise_gateway/services/kernels/remotemanager.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@

from enterprise_gateway.mixins import EnterpriseGatewayConfigMixin

from ...metrics import KERNEL_SHUTDOWN_DURATION_SECONDS, KERNEL_START_DURATION_SECONDS
from ..processproxies.processproxy import BaseProcessProxyABC, LocalProcessProxy, RemoteProcessProxy
from ..sessions.kernelsessionmanager import KernelSessionManager

Expand Down Expand Up @@ -501,7 +502,12 @@ async def start_kernel(self, **kwargs: dict[str, Any] | None):
"""
self._get_process_proxy()
self._capture_user_overrides(**kwargs)
await super().start_kernel(**kwargs)
with KERNEL_START_DURATION_SECONDS.time() as timer:
timer.labels(
kernel_name=self.kernel_name,
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
)
await super().start_kernel(**kwargs)

def _capture_user_overrides(self, **kwargs: dict[str, Any] | None) -> None:
"""
Expand Down Expand Up @@ -588,6 +594,31 @@ def request_shutdown(self, restart: bool = False) -> None:
if isinstance(self.process_proxy, RemoteProcessProxy):
self.process_proxy.shutdown_listener()

async def shutdown_kernel(self, now: bool = False, restart: bool = False):
"""Attempts to stop the kernel process cleanly.

This attempts to shutdown the kernels cleanly by:

1. Sending it a shutdown message over the control channel.
2. If that fails, the kernel is shutdown forcibly by sending it
a signal.

Parameters
----------
now : bool
Should the kernel be forcible killed *now*. This skips the
first, nice shutdown attempt.
restart: bool
Will this kernel be restarted after it is shutdown. When this
is True, connection files will not be cleaned up.
"""
with KERNEL_SHUTDOWN_DURATION_SECONDS.time() as timer:
timer.labels(
kernel_name=self.kernel_name,
process_proxy=f'{self.process_proxy.__class__.__module__}.{type(self.process_proxy).__name__}',
)
await super().shutdown_kernel(now=now, restart=restart)

async def restart_kernel(self, now: bool = False, **kwargs: dict[str, Any] | None) -> None:
"""
Restarts a kernel with the arguments that were used to launch it.
Expand Down
6 changes: 6 additions & 0 deletions enterprise_gateway/tests/test_handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -557,6 +557,12 @@ def test_kernel_env_auth_token(self):
if ws:
ws.close()

@gen_test
def test_get_metrics(self):
"""Getting the swagger.json spec should be ok"""
response = yield self.http_client.fetch(self.get_url("/metrics"))
self.assertEqual(response.code, 200)


class TestCustomDefaultKernel(TestHandlers):
"""Tests gateway behavior when setting a custom default kernelspec."""
Expand Down
36 changes: 36 additions & 0 deletions enterprise_gateway/webapp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
"""Tornado web app for enterprise_gateway."""

from tornado import web
from tornado.web import RequestHandler

from enterprise_gateway.metrics import HTTP_REQUEST_DURATION_SECONDS


class EnterpriseGatewayWebApp(web.Application):
"""
Custom Tornado web application that handles all HTTP traffic for the Enterprise Gateway.
"""

def log_request(self, handler: RequestHandler) -> None:
"""
Tornado log handler for recording RED metrics.

We record the following metrics:
Rate: the number of requests, per second, your services are serving.
Errors: the number of failed requests per second.
Duration: the amount of time each request takes expressed as a time interval.

We use a fully qualified name of the handler as a label,
rather than every url path to reduce cardinality.

This function should be either the value of or called from a function
that is the 'log_function' tornado setting. This makes it get called
at the end of every request, allowing us to record the metrics we need.
"""
super().log_request(handler)

HTTP_REQUEST_DURATION_SECONDS.labels(
method=handler.request.method,
handler=f'{handler.__class__.__module__}.{type(handler).__name__}',
status_code=handler.get_status(),
).observe(handler.request.request_time())
Loading